In [1]:
import pandas as pd

import seaborn
import matplotlib.pyplot as plt

import nltk
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from spacy.lang.en.stop_words import STOP_WORDS as STOP_WORDS_EN
from spacy.lang.fr.stop_words import STOP_WORDS as STOP_WORDS_FR

from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer

In [2]:
import re

In [3]:
!python -m spacy download en

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.3.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [5]:
df_raw = pd.read_json('../../data/tweets-996727069798264832.json'); df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   created_at  300 non-null    datetime64[ns, UTC]
 1   id          300 non-null    int64              
 2   author_id   300 non-null    int64              
 3   text        300 non-null    object             
dtypes: datetime64[ns, UTC](1), int64(2), object(1)
memory usage: 9.5+ KB


In [6]:
df_raw

Unnamed: 0,created_at,id,author_id,text
0,2022-05-01 12:42:03+00:00,1520745362004262912,996727069798264832,"@SouadH9 ptdrrr, c'est génial ça quand t'as la..."
1,2022-05-01 12:02:51+00:00,1520735499601780736,996727069798264832,@lexfridman Seems like you remember of Cambrid...
2,2022-05-01 11:29:49+00:00,1520727184352108544,996727069798264832,"@benjamincode Bon courage à toi, je suis entra..."
3,2022-04-30 17:34:53+00:00,1520456670979276800,996727069798264832,@Pasdepression_ Je suis dans le marché ultra c...
4,2022-04-30 16:08:48+00:00,1520435006392483840,996727069798264832,@logiezer @OlivierReims @Console_buche @fchaba...
...,...,...,...,...
295,2020-08-03 07:08:26+00:00,1290182725299625984,996727069798264832,@Prathkum @GyenAbubakar Nope
296,2020-07-30 06:43:06+00:00,1288726796800864256,996727069798264832,@SchmiegSophie @__apf__ Maybe he just like pho...
297,2020-07-22 21:43:51+00:00,1286054375731978240,996727069798264832,"@BKTrending @denicmarko Java, JS and HTML/CSS ..."
298,2020-07-22 21:40:34+00:00,1286053550901125120,996727069798264832,@BKTrending @denicmarko There is no better lan...


In [7]:
df_raw['text'] = df_raw['text'].apply(lambda f: f.lower())

## Data processing

### Tokenization

In [8]:
def processing_text_to_token(series):
    token = []
    for comment in series.apply(str):
        Word_Tok = []
        for word in  re.sub("\W"," ",comment ).split():
            Word_Tok.append(word)
        token.append(Word_Tok)
    return token

In [9]:
tokens = processing_text_to_token(df_raw['text'])

In [10]:
tokens[:2]

[['souadh9',
  'ptdrrr',
  'c',
  'est',
  'génial',
  'ça',
  'quand',
  't',
  'as',
  'la',
  'connaissance',
  'du',
  'domaine',
  'pour',
  'mesurer',
  'à',
  'quel',
  'point',
  'c',
  'est',
  'faux',
  'xd'],
 ['lexfridman',
  'seems',
  'like',
  'you',
  'remember',
  'of',
  'cambridge',
  'analytica',
  'and',
  'the',
  'power',
  'tech',
  'giant',
  'have',
  'am',
  'i',
  'right']]

In [11]:
df_raw['token'] = tokens

In [12]:
df_raw

Unnamed: 0,created_at,id,author_id,text,token
0,2022-05-01 12:42:03+00:00,1520745362004262912,996727069798264832,"@souadh9 ptdrrr, c'est génial ça quand t'as la...","[souadh9, ptdrrr, c, est, génial, ça, quand, t..."
1,2022-05-01 12:02:51+00:00,1520735499601780736,996727069798264832,@lexfridman seems like you remember of cambrid...,"[lexfridman, seems, like, you, remember, of, c..."
2,2022-05-01 11:29:49+00:00,1520727184352108544,996727069798264832,"@benjamincode bon courage à toi, je suis entra...","[benjamincode, bon, courage, à, toi, je, suis,..."
3,2022-04-30 17:34:53+00:00,1520456670979276800,996727069798264832,@pasdepression_ je suis dans le marché ultra c...,"[pasdepression_, je, suis, dans, le, marché, u..."
4,2022-04-30 16:08:48+00:00,1520435006392483840,996727069798264832,@logiezer @olivierreims @console_buche @fchaba...,"[logiezer, olivierreims, console_buche, fchaba..."
...,...,...,...,...,...
295,2020-08-03 07:08:26+00:00,1290182725299625984,996727069798264832,@prathkum @gyenabubakar nope,"[prathkum, gyenabubakar, nope]"
296,2020-07-30 06:43:06+00:00,1288726796800864256,996727069798264832,@schmiegsophie @__apf__ maybe he just like pho...,"[schmiegsophie, __apf__, maybe, he, just, like..."
297,2020-07-22 21:43:51+00:00,1286054375731978240,996727069798264832,"@bktrending @denicmarko java, js and html/css ...","[bktrending, denicmarko, java, js, and, html, ..."
298,2020-07-22 21:40:34+00:00,1286053550901125120,996727069798264832,@bktrending @denicmarko there is no better lan...,"[bktrending, denicmarko, there, is, no, better..."


### Vocabulary

In [13]:
stop_words=set(STOP_WORDS_FR)

In [14]:
WORDS_FR_TO_REMOVE = set(['qu', 'faire', 'faut', 'ans'])
stop_words.update(WORDS_FR_TO_REMOVE)

In [15]:
len(stop_words)

511

In [16]:
stop_words.update(STOP_WORDS_EN)

In [17]:
len(stop_words)

829

In [18]:
def filtering_series(series):
    AllfilteredText=[]
    for text in series:
        filteredText = [w for w in text if not ((w in stop_words) or (len(w) == 1))]
        AllfilteredText.append(' '.join(filteredText))
    return AllfilteredText

In [19]:
text_processed = filtering_series(df_raw['token'])

In [20]:
text_processed[:2]

['souadh9 ptdrrr génial connaissance domaine mesurer point faux xd',
 'lexfridman like remember cambridge analytica power tech giant right']

In [21]:
df_raw['text_processed'] = text_processed

## Model inference & analysis

In [22]:
def generate_sentiment(df, series_key):
    tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    sentiment_list = []
    polarity_list = []
    sentiment_type = []
    polarity_type = []
    
    for i in df[series_key]:
        sentiment = tb(i).sentiment[0]
        polarity = tb(i).sentiment[1]
            
        if (sentiment > .2): 
            sentiment_type.append('Positive')
        elif (sentiment < -.2): 
            sentiment_type.append('Negative')
        else:
            sentiment_type.append('Neutral')
        
        if (polarity > .8): 
            polarity_type.append('Subjective')
        elif (polarity < .2): 
            polarity_type.append('Objective')
        else:
            polarity_type.append('Neutral')

        sentiment_list.append(sentiment)
        polarity_list.append(polarity)

    df['sentiment'] = sentiment_list
    df['sentiment_type'] = sentiment_type
    df['polarity'] = polarity
    df['polarity_type'] = polarity_type
        
    return df

In [23]:
df_sentiment = generate_sentiment(df_raw, 'text_processed')

In [24]:
df_sentiment

Unnamed: 0,created_at,id,author_id,text,token,text_processed,sentiment,sentiment_type,polarity,polarity_type
0,2022-05-01 12:42:03+00:00,1520745362004262912,996727069798264832,"@souadh9 ptdrrr, c'est génial ça quand t'as la...","[souadh9, ptdrrr, c, est, génial, ça, quand, t...",souadh9 ptdrrr génial connaissance domaine mes...,0.200000,Neutral,0.0,Neutral
1,2022-05-01 12:02:51+00:00,1520735499601780736,996727069798264832,@lexfridman seems like you remember of cambrid...,"[lexfridman, seems, like, you, remember, of, c...",lexfridman like remember cambridge analytica p...,0.000000,Neutral,0.0,Objective
2,2022-05-01 11:29:49+00:00,1520727184352108544,996727069798264832,"@benjamincode bon courage à toi, je suis entra...","[benjamincode, bon, courage, à, toi, je, suis,...",benjamincode bon courage entrain remplir sac d...,0.700000,Positive,0.0,Neutral
3,2022-04-30 17:34:53+00:00,1520456670979276800,996727069798264832,@pasdepression_ je suis dans le marché ultra c...,"[pasdepression_, je, suis, dans, le, marché, u...",pasdepression_ marché ultra compétitif data sc...,0.100000,Neutral,0.0,Objective
4,2022-04-30 16:08:48+00:00,1520435006392483840,996727069798264832,@logiezer @olivierreims @console_buche @fchaba...,"[logiezer, olivierreims, console_buche, fchaba...",logiezer olivierreims console_buche fchabanois...,-0.116667,Neutral,0.0,Neutral
...,...,...,...,...,...,...,...,...,...,...
295,2020-08-03 07:08:26+00:00,1290182725299625984,996727069798264832,@prathkum @gyenabubakar nope,"[prathkum, gyenabubakar, nope]",prathkum gyenabubakar nope,0.000000,Neutral,0.0,Objective
296,2020-07-30 06:43:06+00:00,1288726796800864256,996727069798264832,@schmiegsophie @__apf__ maybe he just like pho...,"[schmiegsophie, __apf__, maybe, he, just, like...",schmiegsophie __apf__ maybe like photography,0.000000,Neutral,0.0,Objective
297,2020-07-22 21:43:51+00:00,1286054375731978240,996727069798264832,"@bktrending @denicmarko java, js and html/css ...","[bktrending, denicmarko, java, js, and, html, ...",bktrending denicmarko java js html css job cpp...,0.000000,Neutral,0.0,Objective
298,2020-07-22 21:40:34+00:00,1286053550901125120,996727069798264832,@bktrending @denicmarko there is no better lan...,"[bktrending, denicmarko, there, is, no, better...",bktrending denicmarko better language tools de...,0.000000,Neutral,0.0,Objective


In [25]:
df_sentiment.polarity_type

0        Neutral
1      Objective
2        Neutral
3      Objective
4        Neutral
         ...    
295    Objective
296    Objective
297    Objective
298    Objective
299    Objective
Name: polarity_type, Length: 300, dtype: object

In [26]:
def get_lang_detector(nlp, name):
    return LanguageDetector()

def setup_spacy(nlp):
    Language.factory("language_detector", func=get_lang_detector)
    nlp.add_pipe('language_detector', last=True)
    
def get_current_language(text):    
    doc = nlp(text)
    # print(doc._.language)
    if doc._.language['score'] > .85:
        return doc._.language['language']
    else:
        return 'multiple'

In [27]:
setup_spacy(nlp)

In [28]:
df_sentiment['language'] = df_sentiment.text_processed.apply(lambda f: get_current_language(f))

In [29]:
df_sentiment.language.value_counts()

fr          166
en           77
multiple     37
af            4
it            2
es            2
ca            2
sv            1
so            1
hu            1
fi            1
nl            1
sl            1
cy            1
tr            1
da            1
ro            1
Name: language, dtype: int64