In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

  from pandas.core import datetools


In [7]:
train = pd.read_csv('data/train.csv')

In [74]:
predict_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

X, y = train.comment_text, train[predict_columns]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

for s in X_train, X_test, y_train, y_test:
    print(s.shape)

(127656,)
(31915,)
(127656, 6)
(31915, 6)


In [75]:
y_test

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
119105,0,0,0,0,0,0
131631,0,0,0,0,0,0
125326,0,0,0,0,0,0
111256,0,0,0,0,0,0
83590,0,0,0,0,0,0
37546,0,0,0,0,0,0
98371,0,0,0,0,0,0
67118,0,0,0,0,0,0
129625,0,0,0,0,0,0
48941,0,0,0,0,0,0


In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [46]:
sid = SentimentIntensityAnalyzer()

In [19]:
negs = X_train.apply(lambda x: sid.polarity_scores(x)['neg'])

In [20]:
X_train

140030    Grandma Terri Should Burn in Trash \nGrandma T...
159124    , 9 May 2009 (UTC)\nIt would be easiest if you...
60006     "\n\nThe Objectivity of this Discussion is dou...
65432                 Shelly Shock\nShelly Shock is. . .( )
154979    I do not care. Refer to Ong Teng Cheong talk p...
3235      "Archive 8: February 2010 - November 2010\n\n ...
83781                 I assume all the subpages of ''' talk
76712       REDIRECT Talk:Croatian Democratic Peasant Party
2874      EDITING/DELETING\n\nHi, im user:CJ2005B. I don...
92630     A few writing tips \n\nMay I suggest you read ...
106261    "\n\nI have changed the headers to small lette...
138019    "\n\nI understand your reasoning but cannot ac...
39598     Fair enough. The standards have change through...
92954     "\nNo, it's a serious breach of policy, he has...
55743     The homeopathic view , the naturopathic view ,...
63657     If this becomes an initiative, I recommend set...
91107                               15:1

In [22]:
y_train['negs'] = negs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [44]:
y_train

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,negs
140030,1,0,0,0,0,0,0.368
159124,0,0,0,0,0,0,0.036
60006,0,0,0,0,0,0,0.098
65432,0,0,0,0,0,0,0.565
154979,0,0,0,0,0,0,0.145
3235,0,0,0,0,0,0,0.000
83781,0,0,0,0,0,0,0.000
76712,0,0,0,0,0,0,0.000
2874,0,0,0,0,0,0,0.000
92630,0,0,0,0,0,0,0.000


# Apply cutoffs

In [32]:
cutoff = 0.06
tempdf = y_train.copy()

predicted_columns = ['negs']

for pc in predicted_columns:
    tempdf[pc] = tempdf[pc].apply(lambda x: 1 - cutoff if x > (1 - cutoff) else ( cutoff if (x < cutoff)  else x))
tempdf

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,negs
140030,1,0,0,0,0,0,0.368
159124,0,0,0,0,0,0,0.060
60006,0,0,0,0,0,0,0.098
65432,0,0,0,0,0,0,0.565
154979,0,0,0,0,0,0,0.145
3235,0,0,0,0,0,0,0.060
83781,0,0,0,0,0,0,0.060
76712,0,0,0,0,0,0,0.060
2874,0,0,0,0,0,0,0.060
92630,0,0,0,0,0,0,0.060


In [34]:
# tempdf.to_csv('data/y_train_with_neg.csv')

# Vader negativity to linear

In [36]:
X = tempdf.negs
ys = [tempdf.iloc[:, i] for i in range(6)]

In [40]:
regressions = []
for y in ys:
    regression = sm.OLS(y, sm.add_constant(X)).fit()
    regressions.append(regression)

In [41]:
regressions[0].summary()

0,1,2,3
Dep. Variable:,toxic,R-squared:,0.24
Model:,OLS,Adj. R-squared:,0.24
Method:,Least Squares,F-statistic:,40240.0
Date:,"Sat, 27 Jan 2018",Prob (F-statistic):,0.0
Time:,19:44:24,Log-Likelihood:,-7551.3
No. Observations:,127656,AIC:,15110.0
Df Residuals:,127654,BIC:,15130.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0606,0.001,-57.108,0.000,-0.063,-0.058
negs,1.4156,0.007,200.595,0.000,1.402,1.429

0,1,2,3
Omnibus:,59120.677,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,313553.039
Skew:,2.241,Prob(JB):,0.0
Kurtosis:,9.234,Cond. No.,9.94


In [42]:
regressions[1].params

const   -0.023159
negs     0.299917
dtype: float64

In [43]:
tempdf.negs

140030    0.368
159124    0.060
60006     0.098
65432     0.565
154979    0.145
3235      0.060
83781     0.060
76712     0.060
2874      0.060
92630     0.060
106261    0.060
138019    0.066
39598     0.060
92954     0.207
55743     0.060
63657     0.060
91107     0.060
70742     0.206
13676     0.060
50777     0.120
151092    0.147
154239    0.060
116118    0.060
37601     0.407
120838    0.060
49961     0.060
145489    0.060
81692     0.060
634       0.085
64857     0.060
          ...  
65725     0.060
123855    0.211
2747      0.140
130523    0.119
149503    0.060
122537    0.086
84478     0.110
156730    0.060
130608    0.060
85305     0.588
103355    0.060
5311      0.062
64925     0.138
59735     0.060
769       0.096
64820     0.212
67221     0.060
41090     0.060
16023     0.060
126324    0.060
112727    0.060
87498     0.157
137337    0.060
54886     0.060
110268    0.065
119879    0.060
103694    0.060
131932    0.060
146867    0.315
121958    0.180
Name: negs, Length: 1276

In [48]:
test_negs = X_test.apply(lambda x: sid.polarity_scores(x)['neg'])

In [51]:
test_negs.shape

(31915,)

In [53]:
i = 0
preds = []
for i in range(6):
    pred = (test_negs * regressions[i].params[1]) + regressions[i].params[0]
    pred = pred.apply(lambda x: max(0, min(1, x)))
    preds.append(pred)
preds

[119105    0.000000
 131631    0.000000
 125326    0.529771
 111256    0.079595
 83590     0.000000
 37546     0.000000
 98371     0.000000
 67118     0.000000
 129625    0.000000
 48941     0.000000
 46117     0.034294
 20136     0.000000
 97862     0.000000
 10245     0.000000
 140013    0.018722
 146861    0.199925
 30404     0.174443
 12000     0.000000
 41916     0.000000
 38974     0.122065
 70064     0.000000
 25255     0.596306
 99638     0.000000
 2783      0.000000
 109183    0.014475
 96229     0.000000
 128796    0.628866
 103592    0.000000
 56415     0.000000
 141058    0.140468
             ...   
 101839    0.056945
 38456     0.000000
 22265     0.239563
 85649     0.005981
 102624    0.150377
 106764    0.066854
 16277     0.436338
 96694     0.243810
 151947    0.049867
 156444    0.255135
 50278     0.231069
 101362    0.052698
 54996     0.000000
 95498     0.199925
 117566    0.086673
 67374     0.000000
 142761    0.000000
 35551     0.047035
 2654      0.116402


In [54]:
predsdf = pd.DataFrame(preds).T
predsdf.columns = ['toxic', 'severe_toxic', 'obscene', 'threat',
                   'insult', 'identity_hate']
predsdf

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
119105,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
131631,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
125326,0.529771,0.101906,0.366235,0.025759,0.341154,0.058743
111256,0.079595,0.006533,0.040995,0.002317,0.038108,0.006827
83590,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
37546,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
98371,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
67118,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
129625,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
48941,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Apply cutoffs

In [57]:
cutoff = 0.04
tempdf = predsdf.copy()
columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for c in columns:
    tempdf[c] = tempdf[c].apply(lambda x: 1 - cutoff if x > (1 - cutoff) else ( cutoff if (x < cutoff)  else x))
tempdf

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
119105,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
131631,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
125326,0.529771,0.101906,0.366235,0.04,0.341154,0.058743
111256,0.079595,0.040000,0.040995,0.04,0.040000,0.040000
83590,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
37546,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
98371,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
67118,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
129625,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
48941,0.040000,0.040000,0.040000,0.04,0.040000,0.040000


## Write to csv

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
119105,0,0,0,0,0,0
131631,0,0,0,0,0,0
125326,0,0,0,0,0,0
111256,0,0,0,0,0,0
83590,0,0,0,0,0,0
37546,0,0,0,0,0,0
98371,0,0,0,0,0,0
67118,0,0,0,0,0,0
129625,0,0,0,0,0,0
48941,0,0,0,0,0,0


In [76]:
y_test['predicted_toxic'] = tempdf.toxic
y_test['predicted_severe_toxic'] = tempdf.severe_toxic
y_test['predicted_obscene'] = tempdf.obscene
y_test['predicted_threat'] = tempdf.threat
y_test['predicted_insult'] = tempdf.insult
y_test['predicted_identity_hate'] = tempdf.identity_hate

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [77]:
y_test

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,predicted_toxic,predicted_severe_toxic,predicted_obscene,predicted_threat,predicted_insult,predicted_identity_hate
119105,0,0,0,0,0,0,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
131631,0,0,0,0,0,0,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
125326,0,0,0,0,0,0,0.529771,0.101906,0.366235,0.04,0.341154,0.058743
111256,0,0,0,0,0,0,0.079595,0.040000,0.040995,0.04,0.040000,0.040000
83590,0,0,0,0,0,0,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
37546,0,0,0,0,0,0,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
98371,0,0,0,0,0,0,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
67118,0,0,0,0,0,0,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
129625,0,0,0,0,0,0,0.040000,0.040000,0.040000,0.04,0.040000,0.040000
48941,0,0,0,0,0,0,0.040000,0.040000,0.040000,0.04,0.040000,0.040000


In [78]:

y_test.to_csv('neg_train_results2.csv', index=False)

# Vader Negativity

In [10]:
test = pd.read_csv('data/test.csv')

In [12]:
test_negs = test.comment_text.apply(lambda x: sid.polarity_scores(x)['neg'])

In [13]:
test['negs'] = test_negs

In [14]:
test.to_csv('data/test_with_neg.csv')

# Vader Positivity

In [16]:
test_negs = test.comment_text.apply(lambda x: sid.polarity_scores(x)['pos'])

In [None]:
test['negs'] = test_negs

# Vader Neutral 