In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/notagain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/notagain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv('/Users/notagain/Desktop/Trust_pilot-1/fabian/NLP/text.csv')
df.drop(['Unnamed: 0', "company", "review"], axis=1, inplace=True)



# Gradient Boosting Tree 

Gradient Boosting Tree is a powerful tool in Sentiment Analysis, that consists in finding the weights that optimize the cost function relative to the classification problem.

To be able to use this algorithm with Scikit-learn, it is essential to convert the strings of the explanatory variable into numerical tokens. So we will use the bag of words algorithm.



In [3]:
X, y = df.text, df.rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 30)


X, y = df.text, df.rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 30)


vectorizer = CountVectorizer()


X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)


y_pred = clf.predict(X_test)


print(classification_report(y_test, y_pred))




confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual class'], colnames=['Predicted class'])
confusion_matrix


              precision    recall  f1-score   support

           1       0.64      0.58      0.61      3636
           2       0.24      0.04      0.07      1491
           3       0.44      0.35      0.39      3000
           4       0.16      0.01      0.02       915
           5       0.83      0.97      0.89     18959

    accuracy                           0.77     28001
   macro avg       0.46      0.39      0.40     28001
weighted avg       0.71      0.77      0.73     28001



Predicted class,1,2,3,4,5
Actual class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2113,90,396,7,1030
2,465,61,404,3,558
3,416,80,1061,13,1430
4,42,3,191,11,668
5,242,24,337,34,18322


# TF-IDF


We would like to compare the CountVectorizer method with the second one proposed in the pre-processing module: the TF-IDF. We'll proceed in exactly the same way as in the previous section.

In [4]:

X_tfidf, y_tfidf = df.text, df.rating


X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y_tfidf, test_size=0.2, random_state = 30)

vec_tfidf = TfidfVectorizer()


X_train_tfidf = vec_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf = vec_tfidf.transform(X_test_tfidf)


clf_tfidf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train_tfidf, y_train_tfidf)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)


In [5]:
conf_matrix_tfidf = pd.crosstab(y_test_tfidf, y_pred_tfidf, rownames=['Actual class'], colnames=['Predicted class'])
print( classification_report(y_test, y_pred) )


conf_matrix_tfidf



              precision    recall  f1-score   support

           1       0.64      0.58      0.61      3636
           2       0.24      0.04      0.07      1491
           3       0.44      0.35      0.39      3000
           4       0.16      0.01      0.02       915
           5       0.83      0.97      0.89     18959

    accuracy                           0.77     28001
   macro avg       0.46      0.39      0.40     28001
weighted avg       0.71      0.77      0.73     28001



Predicted class,1,2,3,4,5
Actual class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1592,110,355,15,1564
2,460,89,220,8,714
3,517,243,482,15,1743
4,61,69,73,6,706
5,292,96,313,31,18227


# Stemming

Let's see if the pre-processing of our data influences our performance. First, we'll try to group the words present in the sentences of our corpus according to common lexical origins.



In [6]:
stemmer = PorterStemmer()

def stemming(words):
    return list({stemmer.stem(word) for word in words})


X_stem, y_stem = df.text, df.rating
X_stem = X_stem.str.split().apply(lambda x: ' '.join(stemming(x)))

X_train_stem, X_test_stem, y_train_stem, y_test_stem = train_test_split(X_stem, y_stem, test_size=0.2, random_state = 30)

vec_stem = TfidfVectorizer()
X_train_stem = vec_stem.fit_transform(X_train_stem)
X_test_stem = vec_stem.transform(X_test_stem)

clf_stem = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train_stem, y_train_stem)

y_pred_stem = clf_stem.predict(X_test_stem)

In [7]:
print(classification_report(y_test_stem, y_pred_stem))


conf_matrix_stem = pd.crosstab(y_test_stem, y_pred_stem, rownames=['Actual class'], colnames=['Predicted class'])
conf_matrix_stem


              precision    recall  f1-score   support

           1       0.63      0.57      0.60      3636
           2       0.21      0.03      0.06      1491
           3       0.44      0.35      0.39      3000
           4       0.20      0.03      0.06       915
           5       0.83      0.96      0.89     18959

    accuracy                           0.77     28001
   macro avg       0.46      0.39      0.40     28001
weighted avg       0.71      0.77      0.73     28001



Predicted class,1,2,3,4,5
Actual class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2088,68,380,15,1085
2,476,48,401,11,555
3,438,70,1040,43,1409
4,33,11,197,32,642
5,258,33,348,56,18264


# Lemmatization


Lemmatization is a technique similar to stemming, but more advanced. It consists of transforming words into their lemmas, a finer and more precise way of finding the origin of a word without iteratively truncating it

In [8]:
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatization(words):
    return list({wordnet_lemmatizer.lemmatize(word) for word in words})


X_lem, y_lem = df.text, df.rating

X_lem = X_lem.str.split().apply(lambda x: ' '.join(lemmatization(x)))



X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X_lem, y_lem, test_size=0.2, random_state = 30)

vec_lem = TfidfVectorizer()


X_train_lem = vec_lem.fit_transform(X_train_lem)
X_test_lem = vec_lem.transform(X_test_lem)



clf_lem = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train_lem, y_train_lem)


y_pred_lem = clf_lem.predict(X_test_lem)

print( classification_report(y_test_lem, y_pred_lem) )


conf_matrix_lem = pd.crosstab(y_test_lem, y_pred_lem, rownames=['Actual class'], colnames=['Predicted class'])
conf_matrix_lem


              precision    recall  f1-score   support

           1       0.63      0.57      0.60      3636
           2       0.22      0.06      0.09      1491
           3       0.44      0.35      0.39      3000
           4       0.24      0.05      0.08       915
           5       0.84      0.96      0.90     18959

    accuracy                           0.77     28001
   macro avg       0.48      0.40      0.41     28001
weighted avg       0.72      0.77      0.73     28001



Predicted class,1,2,3,4,5
Actual class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2084,143,411,14,984
2,473,87,380,10,541
3,446,119,1048,54,1333
4,31,12,197,46,629
5,257,27,356,64,18255


Regex and StopWord


In [10]:

df["text"] = df["text"].str.lower()
df["text"].head()

stop_words = set(stopwords.words('english'))
new_stop_words = [",", ".", "``", "@", "*", "(", ")", "...", "!", "?", "-", "_", ">", "<", ":", "/", "=", "--", "©", "~", ";", "\\", "\\\\"]
df['text'] = df['text'].apply(lambda x: ' '.join(x))
df['text'].head()

df['text'] = df['text'].apply(lambda x: re.sub(r"\.+", '', x))
df['text'] = df['text'].apply(lambda x: re.sub(r"[0-9]+", '', x))
filtered = df['text'].tolist()



X_vf, y_vf = df.text, df.rating


X_vf = X_vf.str.split()

X_vf = list(map(lambda x: ' '.join(lemmatization(x)), X_vf))

X_train_vf, X_test_vf, y_train_vf, y_test_vf = train_test_split(X_vf, y_vf, test_size=0.2, random_state = 30)

vec_vf = TfidfVectorizer()


X_train_vf = vec_vf.fit_transform(X_train_vf)
X_test_vf = vec_vf.transform(X_test_vf)


clf_vf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train_vf, y_train_vf)


y_pred_vf = clf_vf.predict(X_test_vf)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [11]:
print(X_train_vf)


['t k v q p d w ! o u y h c s r l i n m a g e', 't k v p w ! o u y h c s b r l i n m a g e', 't k p d o u y h c s b r l n j i a m x g e', 't k v ’ p w d o u y h f , c s b r l i n m a g e', "t k v q p d w x o u y h f , c ' s b r z l i n m j a g e", 't k v ’ q p d w ! o u y h f , c s b r l i n a m j x g e', 't v q p d w x o u y h f c s b r l i n m a g e', 't k v p d w x o y h f , c s b r l i n m a g e', "t k v p d w o y h f c ' s b r l i n m a g e", 't v ’ p w d o y h c s b r l i n m a e', 't k v p d w x o u y h f c s b r l n i m a g e', 't k v - q p w d o u y h f , c s b r z l i n j m a g e', 'b t r : f , o c i n a p m d e x', 't k v q p ( d w x o u ) y h " f , c $ s b r z l i n m j a g e', 't k v p d ⭐ o u y h f , c s r ️ l i n a g e', 't k v ’ p d w o u + y h f , c s b r l n i m a g e', 't k v ’ p ( d w x o u ) y h f , c s b r l i n m j a g e', 't k v q p d w x o u y h f , c s b r l n i m a g e', "t k - v q p d w o u y h f , c ' s b r l n i m a g e", 't k v ’ - q p d w ! x o u y h f ,

In [None]:
print(classification_report(y_test_vf, y_pred_vf) )


conf_matrix_vf = pd.crosstab(y_test_vf, y_pred_vf, rownames=['Actual class'], colnames=['Predicted class'])
conf_matrix_vf

In [None]:
keep your answers short in order to save tokens on the context window. do not return any comments, only code and modular functions optimized for efficiency", if asynchronous functions can be used, please do so.

create a sklearn pipeline to compare if stemming and lematization, regex and stopwords has a positive effect on performance on each model used. suggest any other aspects worth testing as well.


