In [1]:
# Imports PIL module 
import pandas as pd
import spacy
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
PATH_DATA = './data/data_nlp/'
AD1_FILE = PATH_DATA + 'MeTooHate.csv'
CHUNK_SIZE = 1000

df = pd.read_csv(AD1_FILE)

In [3]:
df.drop(columns=['status_id', 'created_at', 'favorite_count', 'retweet_count',
       'location', 'followers_count', 'friends_count', 'statuses_count',
       ], inplace=True)

df.dropna(inplace=True)
df.shape


(803638, 2)

In [4]:
df.isna().sum()

text        0
category    0
dtype: int64

In [5]:
nlp = spacy.load('en_core_web_sm')
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
import re

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r"", text)

def text_data_cleaning(sentence):
    sentence = remove_url(sentence)
    sentence = remove_html(sentence)
    sentence = remove_emoji(sentence)
    doc = nlp(sentence)

    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    
    cleaned_tokens = []
    for token in tokens:
        if token not in nlp.Defaults.stop_words and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [7]:
text_data_cleaning(df.text[3])

['yep',
 'like',
 'triffele',
 'woman',
 'weaponize',
 'poon',
 'wonder',
 'kamala',
 'harris',
 'extort',
 'willy',
 'brown',
 'throw',
 'poon',
 'oh',
 'yeh',
 'job',
 'joke']

In [8]:
df.text[3]

'Yep just like triffeling women weaponized their poon!! Wonder if Kamala Harris ever extorted Willy Brown after throwing the poon on him, oh yeh, that how she got her first job me too is a JOKE! '

In [9]:
remove_emoji(df.text[9])

" Isn't it nice that you know everything?       \n      "

***TFIDF***

In [10]:
from sklearn.svm import LinearSVC

In [37]:
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifier = LinearSVC(verbose=True)

In [38]:
X = df.text
y = df.category

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 )

In [40]:
X_train.shape, X_test.shape

((642910,), (160728,))

In [41]:
X_train[524689]

"so... abusing women??? I guess karma is a bitch, huh!!! Remember your mantra...!!  So, those rules should serve you well, right??!! Good luck Creepy porn lawyer.. couldn't have happened to a nicer guy!! JERK"

In [42]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)], verbose=True)

In [43]:
clf.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 2) Processing tfidf, total=111.5min
[LibLinear].....*
optimization finished, #iter = 55
Objective value = -106844.563668
nSV = 298712
[Pipeline] ............... (step 2 of 2) Processing clf, total=  14.5s


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7f8dbb262280>)),
                ('clf', LinearSVC(verbose=True))],
         verbose=True)

In [44]:
y_pred = clf.predict(X_test)

In [45]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97    141532
           1       0.88      0.59      0.71     19196

    accuracy                           0.94    160728
   macro avg       0.91      0.79      0.84    160728
weighted avg       0.94      0.94      0.94    160728



In [46]:
confusion_matrix(y_test, y_pred)

array([[140001,   1531],
       [  7799,  11397]])

In [76]:
[X_train[524689]]
clf.predict(['love women'])

array([0])

In [71]:
import pickle
filename = './data/data_nlp/classifier.sav'
pickle.dump(clf, open(filename, 'wb'))

In [70]:
df

Unnamed: 0,text,category
0,"Entitled, obnoxious, defensive, lying weasel. ...",0
1,Thank you and for what you did for the women...,0
2,Knitting (s) &amp; getting ready for January 1...,0
3,Yep just like triffeling women weaponized thei...,1
4,"No, the President wants to end movement posin...",0
...,...,...
807169,Let’s not forget that this “iconic kiss” was u...,0
807170,DEFINITELY....the only one any of us should su...,0
807171,Did the movement count the dollars of Erin An...,0
807172,This is one of my all time fav songs &amp; vid...,1


***PREDICTIONS***

In [170]:
from datetime import datetime
################################################### Keywords for data filtering #########################################
# Talking about women, general women terminology
general_dict = set(['woman', 'women', ' she ', ' she.', ' her ', ' her.', 'girl', 'daughter', 'mother',
                    'sister', 'niece', 'female', 'wife', 'spouse', 'mistress', ' aunt ', ' aunt.', ' aunts ', ' aunts.',
                    ' mom ', ' mom.', ' moms ', ' moms.', ' mum ', ' mum.', ' mums ', ' mums.', ' femme ', ' femme.', 'grandma',
                    ' lady ', ' lady.', ' ladies ', ' ladies.', ' panty ', ' panty.', ' panties ', ' panties.',
                    'madam', ' ms.', ' mrs.', ' ms ', ' mrs ', ' maid ', ' maid.', ' maids ', ' maids.', ' bride ', ' bride.', ' brides ',
                    ' brides.', ' chick ', ' bridesmaid ', ' bridesmaid.', ' bridesmaids ', ' bridesmaids.',
                    ' chick.', ' chicks ', ' chicks.', " she's ", " her's "
                    ])

# Feminine-biased nouns and substrings (Disclaimer: this doesn’t reflect our team’s gender views.)
adj_dict = set(['slut', 'gold digger', 'bitch', 'prostitut', 'bimbo', 'actress',
                ' queen ', ' queen. ', ' queens ', ' queens. ', 'princess', 'whore', ' loca ', 'goddess', 'maiden', ' loca.'
                ' petite ', ' petite.', ' petites ', ' petites.', 'duchess', 'lesbian', 'fashionista', 'doll',
                'nymph', 'cougar', 'milf', 'virgin'])

# Verbs related to women
verb_dict = set(['marri', 'sleep with', 'marry', 'abortion', 'birth control'])

# Terms/actions associated both to MeToo movement and women’s datasets
action_dict = set(['harrass', ' rape ',' rape.',' raped ',' raped.', ' rapes ', ' rapes.', ' rapist ', ' rapist.', ' raping ', ' raping.',
                   'sex', 'domestic violence', 'domestic abuse', 'misogyn'])

# MeToo dictionary
metoo_dict = set(['metoo', 'femin', 'feminism', 'feminist'])

# Personnality MeToo dictionnary
people_dict = set(['harvey weinstein', 'bill cosby', 'tarana burke', 'ambra gutierrez',
                   'anastasia melnichenko', 'alyssa milano', 'r. kelly', 'r kelly','rob kelly', 'robert kelly',
                   'larry nassar', 'reith raniere', 'allison mack',
                   'claude arnault'])

keywords = general_dict\
    .union(adj_dict)\
    .union(verb_dict)\
    .union(action_dict)\
    .union(metoo_dict)\
    .union(people_dict)

################################################# Helper functions ############################################
def generate_data_keyword(src_path, dst_path, keywords):
    with bz2.open(src_path, 'rb') as s_file:
        with bz2.open(dst_path, 'wb') as d_file:
            for instance in s_file:
                instance = json.loads(instance)
                quote = str(instance['quotation']).lower()
                for word in keywords:
                    if word in f' {quote} ':
                        d_file.write((json.dumps(instance)+'\n').encode('utf-8'))
                        break


def generate_data_monthly(src_path, dst_path, keywords):
    with bz2.open(path_to_quotes, 'rb') as s_file:
        for instance in s_file:
            instance = json.loads(instance)
            month = instance['date'][5:7]
            path_per_month = dst_path.format(month)
            with bz2.open(path_per_month, 'wb') as d_file:
                d_file.write((json.dumps(instance)+'\n').encode('utf-8'))


def generate_pickles(scr_path, dst_path, chunk_size=1e5):
    with bz2.open(dst_path, 'wb') as f:
        data_reader = pd.read_json(scr_path, lines=True, compression='bz2', chunksize=chunk_size)
        for chunk in data_reader:
            pkl.dump(chunk, f)


def get_unique_list(serie):
    '''
    find unique element, and corresponding index, of a Serie with values of type List()

    input  : [serie]  : Serie with list for value (might be empty list
    output : [unique] : List of unique element in the Serie serie
             [idx]    : index of 1st row of serie containing the corresponding [unique] value
    '''
    unique = []
    idx = []
    for iiter, i in enumerate(serie.values):
        for j in i:
            if not (j in unique):
                unique.append(j)
                idx.append(iiter)
    return unique, idx


def get_week(dataframe, col):
    return dataframe[col].apply(lambda x: datetime.fromisoformat(x[:-7]).isocalendar()[1])


def get_month(dataframe, col):
    return dataframe[col].apply(lambda x: int(x[5:7]))


def split_quotes_per_gender(chunk, df_selected_parquet, qid_male, qid_female, qids_others, qids_wrong):
    chunk['week']   = get_week(chunk, 'quoteID')
    chunk['month']  = get_month(chunk, 'quoteID')
    
    #_________________
    # SPEAKER NONE
    #`````````````````
    q_is_speaker_None = chunk.speaker=='None' # Checker auusi pou les NaN -> isna()
    q_noSpeaker       = chunk[ q_is_speaker_None].copy()
    q_speaker         = chunk[-q_is_speaker_None]
    q_speaker['qid'] = q_speaker.qids.apply(lambda x: x[0]) # 1st homonym
    q_speaker = q_speaker.drop(columns=['qids'])
    
    
    # Merge with Parquet
    q_speaker = q_speaker.merge(df_selected_parquet, left_on='qid', right_on='id', how='left')
    
    #______________
    # SPEAKER NO PARQUET
    #``````````````
    q_speaker_not_in_parquet = q_speaker.id.isna()
    q_speaker_noParquet  = q_speaker[q_speaker_not_in_parquet].copy()
    
    q_speaker  = q_speaker[-q_speaker_not_in_parquet]
    
    '''
    #__________________
    # SPEAKER NO LABEL
    #``````````````````
    q_is_speaker_labeled = q_speaker.qid.isin(df_qid.QID)
    q_speaker_noLabel = q_speaker[ -q_is_speaker_labeled]
    q_speaker = q_speaker[ q_is_speaker_labeled]
    '''
    
    #______
    # NONE
    #``````
    q_gender_None = q_speaker.gender.isna()
    q_None    = q_speaker[ q_gender_None].copy()
    q_speaker = q_speaker[-q_gender_None]
    
    q_speaker['gender'] = q_speaker.gender.apply(lambda x: x[0]) # keep only 1st gender
    
    #________________________________
    # MALE - FEMALE - OTHERS - WRONG - NOLABEL
    #````````````````````````````````
    q_is_gender_labeled = q_speaker.gender.isin(df_qid.QID)
    q_is_gender_male = q_speaker.gender.isin(qid_male)
    q_is_gender_female = q_speaker.gender.isin(qid_female)
    q_is_gender_others = q_speaker.gender.isin(qids_others)
    q_is_gender_wrong = q_speaker.gender.isin(qids_wrong)
    
    
    q_noLabel  = q_speaker[q_is_gender_labeled].copy()
    q_male     = q_speaker[q_is_gender_male].copy()
    q_female   = q_speaker[q_is_gender_female].copy()
    q_others   = q_speaker[q_is_gender_others].copy()
    q_wrong    = q_speaker[q_is_gender_wrong].copy()
    
    return q_male, q_female, q_others, q_wrong, q_noLabel, q_None, q_speaker_noParquet, q_noSpeaker

In [None]:
PATH_DATA = './../data/'
PATH_PARQUET = PATH_DATA + 'parquet/'
df_parquet = pd.read_parquet(PARQUET_FILE)
df_parquet.head()

In [79]:
year = 2016 # available: from 2015 to 2020
PATH_DATA = './data/data_nlp/'
QUOTES_FILE = PATH_DATA + f'quotes-{year}-filtered.json.bz2'
CHUNK_SIZE = 500

reader = pd.read_json(QUOTES_FILE, lines=True, compression='bz2', chunksize=CHUNK_SIZE, typ='frame')

chunks = [] #utile pour plus loins quand on fait le feature extraction
i=0
for chunk in reader:
    df_0 = chunk
    break
df_0.quotation[0]

"[ Malia ] knows what she is going to do. They have a plan for her and her family feels comfortable knowing that it's not something unstructured,"

In [182]:
year = 2020 # available: from 2015 to 2020
PATH_DATA = './data/data_nlp/'
QUOTES_FILE = PATH_DATA + 'metoo_2019_04.json.bz2'

df = pd.read_json(QUOTES_FILE, lines=True, compression='bz2', typ='frame')


In [141]:
df_new = df.iloc[:100]

In [142]:
df_new['Hatred'] = df_new.quotation.apply(lambda quote: clf.predict([quote]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Hatred'] = df_new.quotation.apply(lambda quote: clf.predict([quote]))


In [158]:
s = df_new[df_new.Hatred == 1].quotation.iloc[3]

In [159]:
def split(text, n=100):
    return []

'Thankfully, with places like Optimism Place, we have supports put in place for people who are involved, or who are victims or survivors of domestic violence and we have the ability to help those people through those problems -- not only with the survivors, but with those families as well in helping to prevent any further problems from happening.'

In [164]:
s_s = [s[100*i:100*(i+1)] for i in range(len(s)//100)] + [s[100*(len(s)//100):]];s_s

['Thankfully, with places like Optimism Place, we have supports put in place for people who are involv',
 'ed, or who are victims or survivors of domestic violence and we have the ability to help those peopl',
 'e through those problems -- not only with the survivors, but with those families as well in helping ',
 'to prevent any further problems from happening.']

In [167]:
for s in s_s:
    clf.predict([s])


In [168]:
clf.predict(s_s)

array([0, 1, 1, 0])

In [209]:

df.to_json(compression='bz2',path_or_buf='./data/data_nlp/TEST.json', orient='index')

In [210]:
DF = pd.read_json('./data/data_nlp/TEST.json', lines=True, compression='bz2', typ='frame')
DF


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9904,9905,9906,9907,9908,9909,9910,9911,9912,9913
0,"{'quoteID': '2019-04-14-028425', 'quotation': ...","{'quoteID': '2019-04-12-006214', 'quotation': ...","{'quoteID': '2019-04-01-064419', 'quotation': ...","{'quoteID': '2019-04-09-075194', 'quotation': ...","{'quoteID': '2019-04-26-057136', 'quotation': ...","{'quoteID': '2019-04-15-036313', 'quotation': ...","{'quoteID': '2019-04-25-060160', 'quotation': ...","{'quoteID': '2019-04-02-073771', 'quotation': ...","{'quoteID': '2019-04-28-038981', 'quotation': ...","{'quoteID': '2019-04-03-005994', 'quotation': ...",...,"{'quoteID': '2019-04-03-007529', 'quotation': ...","{'quoteID': '2019-04-11-009600', 'quotation': ...","{'quoteID': '2019-04-11-078151', 'quotation': ...","{'quoteID': '2019-04-01-088752', 'quotation': ...","{'quoteID': '2019-04-12-052424', 'quotation': ...","{'quoteID': '2019-04-05-099481', 'quotation': ...","{'quoteID': '2019-04-19-004730', 'quotation': ...","{'quoteID': '2019-04-09-015136', 'quotation': ...","{'quoteID': '2019-04-05-022314', 'quotation': ...","{'quoteID': '2019-04-07-025206', 'quotation': ..."


In [197]:
DF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9904,9905,9906,9907,9908,9909,9910,9911,9912,9913
0,"{'quoteID': '2019-04-14-028425', 'quotation': ...","{'quoteID': '2019-04-12-006214', 'quotation': ...","{'quoteID': '2019-04-01-064419', 'quotation': ...","{'quoteID': '2019-04-09-075194', 'quotation': ...","{'quoteID': '2019-04-26-057136', 'quotation': ...","{'quoteID': '2019-04-15-036313', 'quotation': ...","{'quoteID': '2019-04-25-060160', 'quotation': ...","{'quoteID': '2019-04-02-073771', 'quotation': ...","{'quoteID': '2019-04-28-038981', 'quotation': ...","{'quoteID': '2019-04-03-005994', 'quotation': ...",...,"{'quoteID': '2019-04-03-007529', 'quotation': ...","{'quoteID': '2019-04-11-009600', 'quotation': ...","{'quoteID': '2019-04-11-078151', 'quotation': ...","{'quoteID': '2019-04-01-088752', 'quotation': ...","{'quoteID': '2019-04-12-052424', 'quotation': ...","{'quoteID': '2019-04-05-099481', 'quotation': ...","{'quoteID': '2019-04-19-004730', 'quotation': ...","{'quoteID': '2019-04-09-015136', 'quotation': ...","{'quoteID': '2019-04-05-022314', 'quotation': ...","{'quoteID': '2019-04-07-025206', 'quotation': ..."


In [217]:
df.to_json('./data/data_nlp/TEST2.json.bz2', orient='records', lines=True, compression='gzip')

In [220]:
DF = pd.read_json('./data/data_nlp/TEST2.json.bz2', lines=True, compression='bz2')
DF


Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,"{'0': '2019-04-14-028425', '1': '2019-04-12-00...",{'0': 'It's been a real kick-starter for great...,"{'0': 'Lena Headey', '1': 'None', '2': 'Jeff Y...","{'0': ['Q228789'], '1': [], '2': ['Q6175309'],...","{'0': 1555236000000, '1': 1555027200000, '2': ...","{'0': 25, '1': 9, '2': 1, '3': 1, '4': 1, '5':...","{'0': [['Lena Headey', '0.922'], ['None', '0.0...",{'0': ['https://www.news.com.au/entertainment/...,"{'0': 'E', '1': 'E', '2': 'E', '3': 'E', '4': ..."


In [223]:
df.to_json('./data/data_nlp/test.json.bz2', orient='records', lines=True, compression='bz2')

In [224]:
DF = pd.read_json('./data/data_nlp/test.json.bz2', lines=True, compression='bz2')
DF

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase
0,2019-04-14-028425,It's been a real kick-starter for great roles ...,Lena Headey,[Q228789],2019-04-14 10:00:00,25,"[[Lena Headey, 0.922], [None, 0.078]]",[https://www.news.com.au/entertainment/tv/game...,E
1,2019-04-12-006214,and warmly welcomes everyone to the club regar...,,[],2019-04-12 00:00:00,9,"[[None, 0.521], [Raelene Castle, 0.4254], [Bil...",[http://www.readingeagle.com/ap/article/folau-...,E
2,2019-04-01-064419,My thoughts and prayers are with the family me...,Jeff Yurek,[Q6175309],2019-04-01 00:00:00,1,"[[Jeff Yurek, 0.9581], [None, 0.0419]]",[https://www.woodstocksentinelreview.com/news/...,E
3,2019-04-09-075194,"Race, color, national origin, religion, sex, f...",Governor Kim Reynolds,[Q6409269],2019-04-09 22:14:04,1,"[[Governor Kim Reynolds, 0.9014], [None, 0.0986]]",[https://kwwl.com/news/top-stories/2019/04/09/...,E
4,2019-04-26-057136,"The other topics, I would be lying if I told y...",Moses Moreno,[Q6915887],2019-04-26 00:00:00,1,"[[Moses Moreno, 0.7883], [None, 0.1616], [Andr...",[http://kunr.org/post/will-washoe-have-new-sex...,E
...,...,...,...,...,...,...,...,...,...
9909,2019-04-05-099481,"There's a stigma that people are bad, somehow,...",John Herrington,[Q45082],2019-04-05 23:12:30,1,"[[John Herrington, 0.7944], [None, 0.2056]]",[http://www.wdam.com/2019/04/05/pine-belt-wome...,E
9910,2019-04-19-004730,blatantly mocked survivors of sexual assault a...,Camille Paglia,[Q255463],2019-04-19 20:52:00,1,"[[Camille Paglia, 0.7991], [None, 0.2009]]",[https://www.pinknews.co.uk/2019/04/19/camille...,E
9911,2019-04-09-015136,"Clean, half-brogue, Oxford monkstrap and whole...",Edward Sexton,[Q5345282],2019-04-09 00:00:00,1,"[[Edward Sexton, 0.686], [None, 0.314]]",[https://robbreport.com/style/footwear/brown-s...,E
9912,2019-04-05-022314,"For example, the JAS specialists will carefull...",Mark Speakman,[Q6769797],2019-04-05 12:31:08,1,"[[Mark Speakman, 0.4881], [Mr Ward, 0.345], [N...",[https://www.miragenews.com/statewide-justice-...,E
