In [86]:
import pandas as pd 
import os
import spacy
from sklearn.model_selection import train_test_split


#### Loading data

In [35]:
csv_file_path = os.path.join("..", "data", "IMDB Dataset.csv")
df = pd.read_csv(csv_file_path)


In [94]:
X = df['review']  
y = df['sentiment'] 

# Division en parties d'entraînement (80%) et de validation (20%)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

print("Taille de l'ensemble d'entraînement :", len(X_train))
print("Taille de l'ensemble de validation :", len(X_valid))

Taille de l'ensemble d'entraînement : 35000
Taille de l'ensemble de validation : 15000


In [95]:
X_train.reset_index(drop=True, inplace=True)


In [100]:
X_train_df = pd.DataFrame({'review': X_train}, columns=['review'])

X_valid_df = pd.DataFrame({'review': X_valid}, columns=['review'])

y_train_df = pd.DataFrame({'sentiment': y_train}, columns=['sentiment'])

y_valid_df = pd.DataFrame({'sentiment': y_valid}, columns=['sentiment'])

#### Preprocessing : label distribution , tokenization and lemmatisation

In [47]:
positive_label_count  = df[df["sentiment"] == "positive"].shape[0]
negative_label_count = df[df["sentiment"] == "negative"].shape[0]

In [49]:
positive_label_proportion = (positive_label_count / (positive_label_count+ negative_label_count))*100

In [50]:
negative_label_proportion= (negative_label_count / (positive_label_count+ negative_label_count))*100

In [52]:
print('la proportion de sentiments positives est de ', positive_label_proportion)
print('la proportion de sentiments négatifs est de ', negative_label_proportion)

la proportion de sentiments positives est de  50.0
la proportion de sentiments négatifs est de  50.0


In [53]:
print("on constate on a autant de labels positifs que de labels négatifs")

on constate on a autant de labels positifs que de labels négatifs


In [72]:
import spacy

try:
    #rem: enable does not seem to work, don't know why
    nlp = spacy.load("en_core_web_sm") #, enable=["tok2vec", "morphologizer", "lemmatizer"])
except OSError:
    !python -m spacy download fr_core_news_md
    nlp = spacy.load("en_core_web_sm") #, enable=["tok2vec", "morphologizer", "lemmatizer"])

# we won't need ner, parser, attribute_ruler
# NB: tok2vec and morphologizer seem to be necessary for lemmatization
nlp.remove_pipe("ner")
nlp.remove_pipe("parser")
nlp.remove_pipe("attribute_ruler")
print('Current pipeline:\n  '+'\n  '.join(nlp.pipe_names))


Current pipeline:
  tok2vec
  tagger
  lemmatizer


In [112]:
def tokenize_text(text) :
    doc = nlp(text)
    tokens =[token.text for token in doc ]
    return tokens

def lemmatize_text(text) :
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc ]
    return lemmas 
    

In [137]:
X_valid_df['tokens'] = X_train_df['review'].apply(tokenize_text)



In [113]:
%%time
X_train_df['lemmas'] = X_train_df['review'].apply(lemmatize_text)



CPU times: user 7min 55s, sys: 5.97 s, total: 8min 1s
Wall time: 8min 2s


In [114]:
X_train_df

Unnamed: 0,review,tokens,lemmas
0,"As much as I love trains, I couldn't stomach t...","[As, much, as, I, love, trains, ,, I, could, n...","[as, much, as, i, love, trains, ,, i, could, n..."
1,"This was a very good PPV, but like Wrestlemani...","[This, was, a, very, good, PPV, ,, but, like, ...","[this, was, a, very, good, ppv, ,, but, like, ..."
2,Not finding the right words is everybody's pro...,"[Not, finding, the, right, words, is, everybod...","[not, finding, the, right, words, is, everybod..."
3,I'm really suprised this movie didn't get a hi...,"[I, 'm, really, suprised, this, movie, did, n'...","[i, 'm, really, suprised, this, movie, did, n'..."
4,I'll start by confessing that I tend to really...,"[I, 'll, start, by, confessing, that, I, tend,...","[i, 'll, start, by, confessing, that, i, tend,..."
...,...,...,...
34995,`Shadow Magic' recaptures the joy and amazemen...,"[`, Shadow, Magic, ', recaptures, the, joy, an...","[`, shadow, magic, ', recaptures, the, joy, an..."
34996,I found this movie to be quite enjoyable and f...,"[I, found, this, movie, to, be, quite, enjoyab...","[i, found, this, movie, to, be, quite, enjoyab..."
34997,Avoid this one! It is a terrible movie. So wha...,"[Avoid, this, one, !, It, is, a, terrible, mov...","[avoid, this, one, !, it, is, a, terrible, mov..."
34998,This production was quite a surprise for me. I...,"[This, production, was, quite, a, surprise, fo...","[this, production, was, quite, a, surprise, fo..."


In [116]:
tokens_vocab_size = len(set(token for tokens in X_train_df['tokens'] for token in tokens))
lemmas_vocab_size = len(set(lemma for lemmas in X_train_df['lemmas'] for lemma in lemmas))
print("Taille du vocabulaire pour les tokens  :", tokens_vocab_size)
print("Taille du vocabulaire pour les lemmes :", lemmas_vocab_size)

Taille du vocabulaire pour les tokens  : 146293
Taille du vocabulaire pour les lemmes : 121716


In [138]:
X_valid_df['tokens'] = X_valid_df['review'].apply(tokenize_text)



In [139]:
X_valid_df

Unnamed: 0,review,tokens
33553,I really liked this Summerslam due to the look...,"[I, really, liked, this, Summerslam, due, to, ..."
9427,Not many television shows appeal to quite as m...,"[Not, many, television, shows, appeal, to, qui..."
199,The film quickly gets to a major chase scene w...,"[The, film, quickly, gets, to, a, major, chase..."
12447,Jane Austen would definitely approve of this o...,"[Jane, Austen, would, definitely, approve, of,..."
39489,Expectations were somewhat high for me when I ...,"[Expectations, were, somewhat, high, for, me, ..."
...,...,...
15168,"""Landscape after a battle"" opens with escaping...","["", Landscape, after, a, battle, "", opens, wit..."
49241,Jake Speed (1986) was an amusing parody of Ind...,"[Jake, Speed, (, 1986, ), was, an, amusing, pa..."
39317,"PLAN B has the appearance of a quickly made, u...","[PLAN, B, has, the, appearance, of, a, quickly..."
42191,One of the perks of my job is that when things...,"[One, of, the, perks, of, my, job, is, that, w..."


#### Bag-of-Words" (BoW) vector representation

In [131]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=2, max_features=None, stop_words=None)
X_train_tfidf = tfidf_vectorizer.fit_transform([" ".join(doc) for doc in X_train_df["tokens"]])

In [134]:
print(X_train_tfidf)

  (0, 1447)	0.1493097122938871
  (0, 18975)	0.06293896883695577
  (0, 43875)	0.04249173758699013
  (0, 19445)	0.1764950058221125
  (0, 45207)	0.06914440146338069
  (0, 26953)	0.05132252841035166
  (0, 20278)	0.1306669672020459
  (0, 10208)	0.13271826636134537
  (0, 50884)	0.1833670133075118
  (0, 44441)	0.13643068932689864
  (0, 3687)	0.1863370134256986
  (0, 5362)	0.11148535762233444
  (0, 41460)	0.12470861484962832
  (0, 28814)	0.09691601952673515
  (0, 50140)	0.04246419381736024
  (0, 6649)	0.025742473523883168
  (0, 12763)	0.1651702003383859
  (0, 33876)	0.06612525782477688
  (0, 19470)	0.03803616885733382
  (0, 47449)	0.07977678912156905
  (0, 21157)	0.15469643569182906
  (0, 26605)	0.19414905434794
  (0, 4666)	0.05101406750862695
  (0, 31865)	0.1898497779871838
  (0, 36389)	0.19414905434794
  :	:
  (34999, 934)	0.2307420710871859
  (34999, 27864)	0.06698885635286218
  (34999, 2630)	0.03574082572229232
  (34999, 21402)	0.12908472375445057
  (34999, 19802)	0.1617526157870924
  (349