In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the full training table from train.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='train.csv')

In [76]:
# Preview the first five rows to sanity-check the load.
print(df.head(n=5))

      id    target                                       comment_text  \
0  59848  0.000000  This is so cool. It's like, 'would you want yo...   
1  59849  0.000000  Thank you!! This would make my life a lot less...   
2  59852  0.000000  This is such an urgent design problem; kudos t...   
3  59855  0.000000  Is this something I'll be able to install on m...   
4  59856  0.893617               haha you guys are a bunch of losers.   

   severe_toxicity  obscene  identity_attack   insult  threat  asian  atheist  \
0         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   
1         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   
2         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   
3         0.000000      0.0         0.000000  0.00000     0.0    NaN      NaN   
4         0.021277      0.0         0.021277  0.87234     0.0    0.0      0.0   

   ...  article_id    rating  funny  wow  sad  likes  disagree  \
0  ...  

In [77]:
# List every column name so the ones needed downstream can be picked out.
print(list(df.columns))

['id', 'target', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', 'jewish', 'latino', 'male', 'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity', 'other_religion', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit', 'identity_annotator_count', 'toxicity_annotator_count']


In [78]:
# Narrow the frame to the comment text and its `target` score.
kept_columns = ['comment_text', 'target']
filtered_df = df[kept_columns]

In [79]:
# Keep rows whose target is exactly 0 or at least 0.5; rows with a target
# strictly between those two values are dropped.
filtered_df = filtered_df[
    filtered_df['target'].eq(0) | filtered_df['target'].ge(0.5)
]

In [80]:
# Binarise the score: 1 when target >= 0.5, otherwise 0.
# `assign` returns a new frame instead of writing a column into a filtered
# slice of `df`, which avoids pandas' SettingWithCopyWarning. The vectorised
# comparison replaces the per-row Python lambda and yields the same 0/1 values.
filtered_df = filtered_df.assign(
    target=filtered_df['target'].ge(0.5).astype(int)
)

In [81]:
# Absolute number of rows per binary label.
filtered_df_class_count = filtered_df["target"].value_counts(normalize=False)

In [82]:
# Express the per-label counts as fractions of the total row count.
print(filtered_df_class_count / filtered_df_class_count.sum())

0    0.89757
1    0.10243
Name: target, dtype: float64


In [83]:
# Feature frame: a one-column DataFrame (list selector) holding the comment text.
X = filtered_df.loc[:, ['comment_text']]
print(X.head(n=5))

                                        comment_text
0  This is so cool. It's like, 'would you want yo...
1  Thank you!! This would make my life a lot less...
2  This is such an urgent design problem; kudos t...
3  Is this something I'll be able to install on m...
4               haha you guys are a bunch of losers.


In [84]:
# Label frame: a one-column DataFrame (list selector) holding the binary target.
y = filtered_df.loc[:, ['target']]
print(y.head(n=5))

   target
0       0
1       0
2       0
3       0
4       1


In [85]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [86]:
# Per-class row counts for each split, then the same counts as fractions of
# the split size, to compare class balance between train and test.
train_count = y_train.value_counts()
test_count = y_test.value_counts()
print(train_count / train_count.sum())
print(test_count / test_count.sum())

target
0         0.897618
1         0.102382
dtype: float64
target
0         0.897378
1         0.102622
dtype: float64


In [87]:
# Re-attach the labels to the training text by aligning on the row index.
train_df = X_train.merge(y_train, left_index=True, right_index=True)
print(train_df)

                                              comment_text  target
1082526  The biggest winner so far is  Canada's Nationa...       0
509740   That is true, but it also depends on how you i...       0
1024647  I'm not sure how you could spin this to find s...       0
887667   "this President has enacted more legislation a...       0
565579   Relax, the ban is TEMPORARY for 90 days only\n...       0
...                                                    ...     ...
138097   As has been pointed out many times by people f...       0
329105   "So, adhering to the mantra that you are who y...       0
165711   Yeah, people move to Oregon all right...about ...       0
856650   Richard, if its the "best kept secret" how cou...       0
153070   Since Hynix' maintenance is not paid for by La...       0

[1127278 rows x 2 columns]


In [88]:
# Re-attach the labels to the held-out text by aligning on the row index.
test_df = X_test.merge(y_test, left_index=True, right_index=True)
print(test_df)

                                              comment_text  target
658847   Here's the latest numbers from the 2016 Presid...       0
476462   I had a conversation with Wally in 1967 about ...       0
888134   If you are arguing for Parliamentary Privilege...       0
1225856  Keeping cargo in the line up is kind of like h...       0
151020              "Drop the check and back away slowly!"       0
...                                                    ...     ...
500435   I'd love to see Bautista have a great year, if...       0
276388                       It ain’t over till it’s over.       0
1237137  More like not incur more debt with little in t...       0
1293274  That is all that you could find? An obtuse fal...       0
1724683  I wouldn't be surprised if there are firefight...       0

[281820 rows x 2 columns]


In [89]:
# Display the absolute per-class row counts of the training labels.
train_count

target
0         1011865
1          115413
dtype: int64

In [90]:
# Display the absolute per-class row counts of the test labels.
test_count

target
0         252899
1          28921
dtype: int64

In [91]:
# Down-sample every class to the size of the smallest one so the training set
# is an even split. The size is derived from `train_df` itself rather than from
# `train_count[1]`, so the cell no longer assumes class 1 is the minority
# (sampling more rows than a group holds would raise) and no longer relies on
# an integer lookup into the `value_counts()` result of a DataFrame.
minority_size = int(train_df["target"].value_counts().min())
balanced_train_df = train_df.groupby("target").sample(n=minority_size, random_state=12)  # take even split

In [92]:
# Shuffle the balanced rows. The fixed seed makes the resulting row order, and
# therefore the CSV written from this frame, identical on every run; without it
# each run produced a different ordering.
balanced_train_df = balanced_train_df.sample(frac=1, random_state=12)  # shuffle

In [93]:
# Count rows per label in the balanced training split.
balanced_train_df["target"].value_counts(normalize=False)

1    115413
0    115413
Name: target, dtype: int64

In [94]:
# Preview the first five rows of the shuffled, balanced training set.
print(balanced_train_df.head(n=5))

                                              comment_text  target
1629799  America has no future with h. clinton around. ...       1
284550   I was just telling a client about the reason w...       0
1362542         Coming from a war criminal it means a lot.       0
483833   So  Anka's explanation is 'I need the money'. ...       0
79591    Well, Money talks. On the other Hand is there ...       0


In [95]:
# Preview the first five rows of the held-out test set.
print(test_df.head(n=5))

                                              comment_text  target
658847   Here's the latest numbers from the 2016 Presid...       0
476462   I had a conversation with Wally in 1967 about ...       0
888134   If you are arguing for Parliamentary Privilege...       0
1225856  Keeping cargo in the line up is kind of like h...       0
151020              "Drop the check and back away slowly!"       0


In [96]:
# Persist the balanced training set; the row index is not written to the file.
balanced_train_df.to_csv(path_or_buf="processed_train.csv", index=False)

In [97]:
# Persist the held-out test set; the row index is not written to the file.
test_df.to_csv(path_or_buf="processed_test.csv", index=False)