In [1]:
# Imports PIL module 
import pandas as pd
import spacy
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
PATH_DATA = './data/data_nlp/'
AD1_FILE = PATH_DATA + 'MeTooHate.csv'
CHUNK_SIZE = 1000

df = pd.read_csv(AD1_FILE)

In [3]:
df.drop(columns=['status_id', 'created_at', 'favorite_count', 'retweet_count',
       'location', 'followers_count', 'friends_count', 'statuses_count',
       ], inplace=True)

df.dropna(inplace=True)
df.shape


(803638, 2)

In [4]:
df.isna().sum()

text        0
category    0
dtype: int64

In [5]:
nlp = spacy.load('en_core_web_sm')
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
import re

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r"", text)

def text_data_cleaning(sentence):
    sentence = remove_url(sentence)
    sentence = remove_html(sentence)
    sentence = remove_emoji(sentence)
    doc = nlp(sentence)

    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in nlp.Defaults.stop_words and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [7]:
text_data_cleaning(df.text[3])

['yep',
 'like',
 'triffele',
 'woman',
 'weaponize',
 'poon',
 'wonder',
 'kamala',
 'harris',
 'extort',
 'willy',
 'brown',
 'throw',
 'poon',
 'oh',
 'yeh',
 'job',
 'joke']

In [8]:
df.text[3]

'Yep just like triffeling women weaponized their poon!! Wonder if Kamala Harris ever extorted Willy Brown after throwing the poon on him, oh yeh, that how she got her first job me too is a JOKE! '

In [9]:
remove_emoji(df.text[9])

" Isn't it nice that you know everything?       \n      "

***TFIDF***

In [10]:
from sklearn.svm import LinearSVC

In [37]:
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifier = LinearSVC(verbose=True)

In [38]:
X = df.text
y = df.category

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 )

In [40]:
X_train.shape, X_test.shape

((642910,), (160728,))

In [41]:
X_train[524689]

"so... abusing women??? I guess karma is a bitch, huh!!! Remember your mantra...!!  So, those rules should serve you well, right??!! Good luck Creepy porn lawyer.. couldn't have happened to a nicer guy!! JERK"

In [42]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)], verbose=True)

In [43]:
clf.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 2) Processing tfidf, total=111.5min
[LibLinear].....*
optimization finished, #iter = 55
Objective value = -106844.563668
nSV = 298712
[Pipeline] ............... (step 2 of 2) Processing clf, total=  14.5s


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7f8dbb262280>)),
                ('clf', LinearSVC(verbose=True))],
         verbose=True)

In [44]:
y_pred = clf.predict(X_test)

In [45]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97    141532
           1       0.88      0.59      0.71     19196

    accuracy                           0.94    160728
   macro avg       0.91      0.79      0.84    160728
weighted avg       0.94      0.94      0.94    160728



In [46]:
confusion_matrix(y_test, y_pred)

array([[140001,   1531],
       [  7799,  11397]])

In [76]:
[X_train[524689]]
clf.predict(['love women'])

array([0])

In [71]:
import pickle
filename = './data/data_nlp/classifier.sav'
pickle.dump(clf, open(filename, 'wb'))

In [70]:
df

Unnamed: 0,text,category
0,"Entitled, obnoxious, defensive, lying weasel. ...",0
1,Thank you and for what you did for the women...,0
2,Knitting (s) &amp; getting ready for January 1...,0
3,Yep just like triffeling women weaponized thei...,1
4,"No, the President wants to end movement posin...",0
...,...,...
807169,Let’s not forget that this “iconic kiss” was u...,0
807170,DEFINITELY....the only one any of us should su...,0
807171,Did the movement count the dollars of Erin An...,0
807172,This is one of my all time fav songs &amp; vid...,1


***PREDICTIONS***

In [79]:
year = 2016 # available: from 2015 to 2020
PATH_DATA = './data/data_nlp/'
QUOTES_FILE = PATH_DATA + f'quotes-{year}-filtered.json.bz2'
CHUNK_SIZE = 500

reader = pd.read_json(QUOTES_FILE, lines=True, compression='bz2', chunksize=CHUNK_SIZE, typ='frame')

chunks = [] #utile pour plus loins quand on fait le feature extraction
i=0
for chunk in reader:
    df_0 = chunk
    break
df_0.quotation[0]

"[ Malia ] knows what she is going to do. They have a plan for her and her family feels comfortable knowing that it's not something unstructured,"

In [103]:
# CREATE MONTHLY QUOTATION WHICH SPEAK ABOUT METOO

import bz2
import json
from time import time

PATH_DATA = './data/data_nlp/'
path_to_quotes= PATH_DATA + f'quotes-{year}-filtered.json.bz2'


from helper import people_dict, metoo_dict, action_dict
keywords = people_dict\
          .union(metoo_dict)\
          .union(action_dict)
years = [2020]
start = time()
for year in years:
    #path_to_quotes = f'./data/Quotebank_processed/Quotebank/quotes-{year}-filtered.json.bz2'
    path_to_quotes= PATH_DATA + f'quotes-{year}-filtered.json.bz2'
    begin = time()
    with bz2.open(path_to_quotes, 'rb') as s_file:
        for instance in s_file:
            instance = json.loads(instance) # loading a sample
            month = instance['date'][5:7]
            path_per_month = PATH_DATA + f'metoo_{year}_{month}.json.bz2'
            
            quote = str(instance['quotation']).lower()
            with bz2.open(path_per_month, 'ab') as d_file:
                for word in keywords:
                    if word in f' {quote} ':
                        d_file.write((json.dumps(instance)+'\n').encode('utf-8'))
                        break
    print(f'year {year} have been procesed in {time()-begin} [s]')

print(f'full data have been processed in {time()-start} [s]')

import pandas as pd

year = 2020
month = 3
if month < 10:
    path_per_month = PATH_DATA + f'metoo_{year}_0{month}.json.bz2'
else:
    path_per_month = PATH_DATA + f'/metoo_{year}_{month}.json.bz2'
    
df = pd.read_json(path_per_month, lines=True, compression='bz2')
df

year 2020 have been procesed in 134.20068502426147 [s]
full data have been processed in 134.2030429840088 [s]


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2020-03-10-007520,Beneath those those long cassocks -- whether b...,Carlo Poggioli,[Q5041579],2020-03-10 14:03:54,1,"[[Carlo Poggioli, 0.915], [Prince of Wales, 0....",[http://www.vulture.com/2020/03/the-new-pope-h...,E
1,2020-03-04-008310,But in practice giving a claim of `sex games g...,,[],2020-03-04 11:02:44,1,"[[None, 0.8985], [Harriet Harman, 0.0631], [Ma...",[https://www.independent.co.uk/life-style/wome...,E
2,2020-03-03-010183,cultural movement centered on a set of social ...,Katherine Stewart,"[Q21466308, Q43390345, Q43392594, Q6376550]",2020-03-03 00:00:00,2,"[[Katherine Stewart, 0.6689], [None, 0.3312]]",[https://www.rawstory.com/2020/03/hot-topics-i...,E
3,2020-03-29-024263,The Diocese followed its established procedure...,,[],2020-03-29 21:54:00,1,"[[None, 0.8938], [James Walsh, 0.1062]]",[https://www.timesleader.com/news/778191/pries...,E
4,2020-03-05-053457,Predators who actually travel in an attempt to...,,[],2020-03-05 18:05:02,1,"[[None, 0.5368], [J. Douglas, 0.4632]]",[http://chattanoogan.com/2020/3/5/405198/Man-W...,E
...,...,...,...,...,...,...,...,...,...
6366,2020-03-07-019853,It's no surprise that Democrat logic in 2020 m...,,[],2020-03-07 15:53:23,3,"[[None, 0.8521], [Steny Hoyer, 0.0417], [Laure...",[https://pjmedia.com/trending/majority-of-demo...,E
6367,2020-03-18-029792,"If I do get the chance to be in the kitchen, I...",,[],2020-03-18 12:21:21,1,"[[None, 0.5954], [Fred Sirieix, 0.4046]]",[http://www.hellomagazine.com/brides/202003188...,E
6368,2020-03-10-013870,Everyone speaks out says how sorry they are wh...,,[],2020-03-10 15:00:21,3,"[[None, 0.9541], [Emily Sears, 0.0459]]",[http://greenwichtime.com/entertainment/articl...,E
6369,2020-03-09-039253,Men and women are much more likely to be satis...,Robin Milhausen,[Q7352658],2020-03-09 11:00:31,4,"[[Robin Milhausen, 0.8862], [None, 0.1138]]",[https://calgarysun.com/life/sex-files/0309-se...,E


array([0])

In [107]:
df['Hatred'] = df.quotation.apply(lambda quote: clf.predict([quote]))

In [111]:
df

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,Hatred
0,2020-03-10-007520,Beneath those those long cassocks -- whether b...,Carlo Poggioli,[Q5041579],2020-03-10 14:03:54,1,"[[Carlo Poggioli, 0.915], [Prince of Wales, 0....",[http://www.vulture.com/2020/03/the-new-pope-h...,E,[1]
1,2020-03-04-008310,But in practice giving a claim of `sex games g...,,[],2020-03-04 11:02:44,1,"[[None, 0.8985], [Harriet Harman, 0.0631], [Ma...",[https://www.independent.co.uk/life-style/wome...,E,[0]
2,2020-03-03-010183,cultural movement centered on a set of social ...,Katherine Stewart,"[Q21466308, Q43390345, Q43392594, Q6376550]",2020-03-03 00:00:00,2,"[[Katherine Stewart, 0.6689], [None, 0.3312]]",[https://www.rawstory.com/2020/03/hot-topics-i...,E,[0]
3,2020-03-29-024263,The Diocese followed its established procedure...,,[],2020-03-29 21:54:00,1,"[[None, 0.8938], [James Walsh, 0.1062]]",[https://www.timesleader.com/news/778191/pries...,E,[0]
4,2020-03-05-053457,Predators who actually travel in an attempt to...,,[],2020-03-05 18:05:02,1,"[[None, 0.5368], [J. Douglas, 0.4632]]",[http://chattanoogan.com/2020/3/5/405198/Man-W...,E,[0]
...,...,...,...,...,...,...,...,...,...,...
6366,2020-03-07-019853,It's no surprise that Democrat logic in 2020 m...,,[],2020-03-07 15:53:23,3,"[[None, 0.8521], [Steny Hoyer, 0.0417], [Laure...",[https://pjmedia.com/trending/majority-of-demo...,E,[0]
6367,2020-03-18-029792,"If I do get the chance to be in the kitchen, I...",,[],2020-03-18 12:21:21,1,"[[None, 0.5954], [Fred Sirieix, 0.4046]]",[http://www.hellomagazine.com/brides/202003188...,E,[0]
6368,2020-03-10-013870,Everyone speaks out says how sorry they are wh...,,[],2020-03-10 15:00:21,3,"[[None, 0.9541], [Emily Sears, 0.0459]]",[http://greenwichtime.com/entertainment/articl...,E,[1]
6369,2020-03-09-039253,Men and women are much more likely to be satis...,Robin Milhausen,[Q7352658],2020-03-09 11:00:31,4,"[[Robin Milhausen, 0.8862], [None, 0.1138]]",[https://calgarysun.com/life/sex-files/0309-se...,E,[1]


In [129]:
for i in range(20):
    print(df[df.Hatred == 1].quotation.iloc[i] + '\n')

Beneath those those long cassocks -- whether black for priests, red for cardinals, or white for popes -- there are real human beings, who, in spite of their vow of chastity to God, still have their passions, loves, and sexual desires, repressed out of love for the rules of the Church in an acceptance of the sense of `sin' that the Church imposed since its official establishment,

Were you ever forced to have sex when you did not want to

I think that today in the former Soviet Union countries the word feminism carries some negative connotations and I'd like to say that feminism is not about just fighting for women's rights, it's also about combating major issues such as domestic violence, child marriage, bride kidnapping, so I stand for every person's rights to be fully respected,

My friend, a stylist, is best friends with Sophie Wessex, and Sophie buys from us. She got our [ £ 295 ] black Sadie jumpsuit for herself and then asked for one for Kate. She said: `Kate wants one, can I get