# Hackthon 04

In [3]:
# importing needed packages here

import os
import re
import spacy
import hashlib
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter
from spacy.matcher import Matcher
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from nltk.tokenize import WordPunctTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier


def _hash(s):
    return hashlib.sha256(
        bytes(str(s), encoding='utf8'),
    ).hexdigest()

cpu_count = int(os.cpu_count()) if os.cpu_count() != None else 4

In [2]:
#!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl (173.5 MB)
[K     |████████████████████████████████| 173.5 MB 21 kB/s  eta 0:00:01   |▍                               | 1.9 MB 2.4 MB/s eta 0:01:13     |███████▍                        | 40.3 MB 3.5 MB/s eta 0:00:39     |███████▋                        | 41.3 MB 3.5 MB/s eta 0:00:39     |███████▊                        | 42.0 MB 3.5 MB/s eta 0:00:38     |███████████████▉                | 86.1 MB 3.5 MB/s eta 0:00:25     |████████████████                | 86.3 MB 807 kB/s eta 0:01:48     |█████████████████               | 92.6 MB 4.3 MB/s eta 0:00:19     |█████████████████▌              | 94.9 MB 4.3 MB/s eta 0:00:19     |█████████████████▋              | 95.5 MB 4.1 MB/s eta 0:00:20     |█████████████████████           | 114.0 MB 2.8 MB/s eta 0:00:22     |███████████████████████▊        | 128.8 MB 3.2 MB/s eta 0:00:15     |████████████████████████        | 130.3 MB 2.7 MB/s eta 0:00:17     |██████████████

In [4]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a column from the dataframe to perform additional transformations on
    """ 
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    

class TextSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def transform(self, X):
        return X[self.key]
    
    
class NumberSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        return X[[self.key]]

In [5]:
def split_dataset(df, feature_names=["reviewText", "rating"], label="label"):
    X_train, X_test, y_train, y_test = train_test_split(df[feature_names], df[label], 
                                                        test_size=0.2, stratify=df["label"])

    return X_train, X_test, y_train, y_test

In [6]:
def train_pipeline(pipe, X_train, y_train):
    """
    Train a Random Forest using sklearn's Pipeline and return the trained model and its accuracy in the test set.
    """
    pipe.fit(X_train, y_train)

In [7]:
def get_predictions(pipe, X_test):
    return pipe.predict(X_test)

In [8]:
def get_scores(y_test, y_pred):
    f1_score_true_label = f1_score(y_test, y_pred)
    classification_report_pred = classification_report(y_test, y_pred)
    confusion_matrix_pred = confusion_matrix(y_test, y_pred)
    
    return f1_score_true_label, classification_report_pred, confusion_matrix_pred

In [9]:
def drop_columns(df_to_process, columns=["reviewerID", "reviewTime", "reviewerName", "summary"]):
    
    df_copy = df_to_process.copy()
    df_copy = df_copy.reset_index()
    df_copy = df_copy.drop(columns=["reviewerID", "reviewTime", "reviewerName", "summary"])

    return df_copy

def drop_columns_taining_set(df_to_process):
    df_copy = df_to_process.copy()
    return df_copy.drop(columns=["overall", "rates_count", "helpful_count", "label_score"])

In [10]:
def y_sub_to_csv(y_sub, name="baseline"):
    submission = pd.DataFrame(y_sub)
    submission.columns=["is_helpful"]
    submission = submission["is_helpful"].astype(str).str.lower()
    return submission.to_csv("preds/" + name + '.csv', index = False)


In [11]:
def describe_label(df, label):
    display(df.loc[df_first_iteration['label'] == label].describe())

In [12]:
def drop_empty_reviewText(df):
    df_copy = df.copy()
    return df_copy.loc[df_copy['reviewText'] != '']

### Removing Stop Words and Punctuation

In [13]:
# Remove Punctuation and Stop Words
tokenizer = WordPunctTokenizer()

def remove_punctuation(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()

def remove_stopwords(text, stopwords):            
    tokens = tokenizer.tokenize(text)
    text_processed = " ".join([token for token in tokens if token not in stopwords])
    return text_processed

def preprocess_text(df, column):
    df_processed = df.copy()
    df_processed[column] = df_processed[column].apply(remove_punctuation)
    df_processed[column] = df_processed[column].apply(remove_stopwords, stopwords = en_stopwords)
    return df_processed

def number_of_adv_adj(docs_to_use):
    nb_adj_adv = []
    matcher = Matcher(nlp.vocab) 
    matcher.add("adv_or_adj", [[{"POS": "ADJ"}], [{"POS": "ADV"}]])
    nb_adj_adv = [len(matcher(doc)) for doc in docs_to_use]

    return nb_adj_adv

In [14]:
df_second_iteration = pd.read_csv("processed_labelled_data.csv")
df_second_iteration_unlabelled = pd.read_csv("processed_unlabelled.csv")

In [39]:
def define_second_iteration_pipeline(number_of_features):
    nb_words_pipe = Pipeline([
        ('selector', NumberSelector("nb_words")),
        ('standard', StandardScaler())
    ])

    nb_adj_adv_pipe = Pipeline([
        ('selector', NumberSelector("nb_adj_adv")),
        ('standard', StandardScaler())
    ])

    ratio_adj_adv_pipe = Pipeline([
        ('selector', NumberSelector("ratio_adj_adv")),
        ('standard', StandardScaler())
    ])

    reviewTestPipeline = Pipeline([
        ('selector', TextSelector("reviewText")),
        ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
        ('feature_selection', SelectKBest(chi2, k=number_of_features))
    ])

    rating = Pipeline([
        ('selector', NumberSelector("rating")),
        ('standard', StandardScaler())
    ])

    feats = FeatureUnion([
        ('reviewTestPipeline', reviewTestPipeline),
        ('rating', rating),
        ('ratio_adj_adv', ratio_adj_adv_pipe),
        ('nb_words', nb_words_pipe),
        ('nb_adj_adv', nb_adj_adv_pipe),
    ])

    second_iteration_pipeline = Pipeline([
        ('features', feats),
        ('classifier', RandomForestClassifier()),
    ])

    return second_iteration_pipeline

In [16]:
#for number_of_feats in [100, 1000, 10000, 25000, 50000]:
for number_of_feats in [5000]:

    second_iteration_pipeline = define_second_iteration_pipeline(number_of_features=number_of_feats)
    X_train, X_test, y_train, y_test = split_dataset(df_second_iteration, feature_names=["reviewText", "rating", "nb_adj_adv", "nb_words", "ratio_adj_adv"], label="label")
    train_pipeline(second_iteration_pipeline, X_train, y_train)
    y_val = get_predictions(second_iteration_pipeline, X_test)
    f1_score_test, class_report_test, confusion_matrix_test = get_scores(y_val, y_test)
    print("Report for number of features " + str(number_of_feats) + ":")
    print(class_report_test)
    print(confusion_matrix_test)

Report for number of features 5000:
              precision    recall  f1-score   support

       False       0.19      0.50      0.28       696
        True       0.91      0.71      0.80      4989

    accuracy                           0.69      5685
   macro avg       0.55      0.61      0.54      5685
weighted avg       0.82      0.69      0.74      5685

[[ 347  349]
 [1435 3554]]


In [38]:
nb_words_pipe = Pipeline([
    ('selector', NumberSelector("nb_words")),
    ('standard', StandardScaler())
])

nb_adj_adv_pipe = Pipeline([
    ('selector', NumberSelector("nb_adj_adv")),
    ('standard', StandardScaler())
])

ratio_adj_adv_pipe = Pipeline([
    ('selector', NumberSelector("ratio_adj_adv")),
    ('standard', StandardScaler())
])

reviewTestPipeline = Pipeline([
    ('selector', TextSelector("reviewText")),
    ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
    ('feature_selection', SelectKBest(chi2, k=number_of_features))
])

rating = Pipeline([
    ('selector', NumberSelector("rating")),
    ('standard', StandardScaler())
])

feats = FeatureUnion([
    ('reviewTestPipeline', reviewTestPipeline),
    ('rating', rating),
    ('ratio_adj_adv', ratio_adj_adv_pipe),
    ('nb_words', nb_words_pipe),
    ('nb_adj_adv', nb_adj_adv_pipe),
])

second_iteration_pipeline = Pipeline([
    ('features', feats),
    ('classifier', XGBClassifier()),
])

NameError: name 'number_of_features' is not defined

In [None]:
XGBClassifier(n_estimators=1200, max_depth=25, n_jobs=-1)

In [27]:
df_second_iteration ["doc_length"] = df_second_iteration.reviewText.apply(len)
df_second_iteration ["avg_word_length"] = df_second_iteration.reviewText.apply(lambda x: np.mean([len(word) for word in x.split(" ")]))


In [30]:
def define_second_iteration_pipeline(number_of_features):
    nb_words_pipe = Pipeline([
        ('selector', NumberSelector("nb_words")),
        ('standard', StandardScaler())
    ])

    nb_adj_adv_pipe = Pipeline([
        ('selector', NumberSelector("nb_adj_adv")),
        ('standard', StandardScaler())
    ])

    ratio_adj_adv_pipe = Pipeline([
        ('selector', NumberSelector("ratio_adj_adv")),
        ('standard', StandardScaler())
    ])

    reviewTestPipeline = Pipeline([
        ('selector', TextSelector("reviewText")),
        ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
        ('feature_selection', SelectKBest(chi2, k=number_of_features))
    ])

    rating = Pipeline([
        ('selector', NumberSelector("rating")),
        ('standard', StandardScaler())
    ])
    
    doc_length_pipe =  Pipeline([
                ('selector', NumberSelector("doc_length")),
                ('standard', StandardScaler())
            ])

    avg_word_length_pipe =  Pipeline([
                ('selector', NumberSelector("avg_word_length")),
                ('standard', StandardScaler())
        ])

    feats = FeatureUnion([
        ('reviewTestPipeline', reviewTestPipeline),
        ('rating', rating),
        ('ratio_adj_adv', ratio_adj_adv_pipe),
        ('nb_words', nb_words_pipe),
        ('nb_adj_adv', nb_adj_adv_pipe),
        ('doc_length', doc_length_pipe),
        ('avg_word_length', avg_word_length_pipe)
    ])

    second_iteration_pipeline = Pipeline([
        ('features', feats),
        ('classifier', RandomForestClassifier()),
    ])

    return second_iteration_pipeline

In [34]:
#for number_of_feats in [100, 1000, 10000, 25000, 50000]:
for number_of_feats in [5000]:

    second_iteration_pipeline = define_second_iteration_pipeline(number_of_features=number_of_feats)
    X_train, X_test, y_train, y_test = split_dataset(df_second_iteration, feature_names=["reviewText", "rating", "nb_adj_adv", "nb_words", "ratio_adj_adv", 'doc_length', 'avg_word_length'], label="label")
    train_pipeline(second_iteration_pipeline, X_train, y_train)
    y_val = get_predictions(second_iteration_pipeline, X_test)
    f1_score_test, class_report_test, confusion_matrix_test = get_scores(y_val, y_test)
    print("Report for number of features " + str(number_of_feats) + ":")
    print(class_report_test)
    print(confusion_matrix_test)

Report for number of features 5000:
              precision    recall  f1-score   support

       False       0.19      0.54      0.28       634
        True       0.93      0.72      0.81      5051

    accuracy                           0.70      5685
   macro avg       0.56      0.63      0.55      5685
weighted avg       0.84      0.70      0.75      5685

[[ 344  290]
 [1438 3613]]


In [35]:
true_samples = df_second_iteration.loc[df_second_iteration["label"] == True]
false_samples = df_second_iteration.loc[df_second_iteration["label"] == False]
false_samples_size = false_samples.shape[0]

true_samples_rebalance = true_samples.sample(n=false_samples_size)

balanced_df = pd.concat([false_samples, true_samples_rebalance])

In [37]:
#for number_of_feats in [100, 1000, 10000, 25000, 50000]:
for number_of_feats in [5000]:

    second_iteration_pipeline = define_second_iteration_pipeline(number_of_features=number_of_feats)
    X_train, X_test, y_train, y_test = split_dataset(balanced_df, feature_names=["reviewText", "rating", "nb_adj_adv", "nb_words", "ratio_adj_adv", 'doc_length', 'avg_word_length'], label="label")
    train_pipeline(second_iteration_pipeline, X_train, y_train)
    y_val = get_predictions(second_iteration_pipeline, X_test)
    f1_score_test, class_report_test, confusion_matrix_test = get_scores(y_val, y_test)
    print("Report for number of features " + str(number_of_feats) + ":")
    print(class_report_test)
    print(confusion_matrix_test)

Report for number of features 5000:
              precision    recall  f1-score   support

       False       0.60      0.62      0.61      1705
        True       0.64      0.61      0.63      1859

    accuracy                           0.62      3564
   macro avg       0.62      0.62      0.62      3564
weighted avg       0.62      0.62      0.62      3564

[[1065  640]
 [ 717 1142]]


In [40]:
#for number_of_feats in [100, 1000, 10000, 25000, 50000]:
for number_of_feats in [5000]:

    second_iteration_pipeline = define_second_iteration_pipeline(number_of_features=number_of_feats)
    X_train, X_test, y_train, y_test = split_dataset(balanced_df, feature_names=["reviewText", "rating", "nb_adj_adv", "nb_words", "ratio_adj_adv"], label="label")
    train_pipeline(second_iteration_pipeline, X_train, y_train)
    y_val = get_predictions(second_iteration_pipeline, X_test)
    f1_score_test, class_report_test, confusion_matrix_test = get_scores(y_val, y_test)
    print("Report for number of features " + str(number_of_feats) + ":")
    print(class_report_test)
    print(confusion_matrix_test)

Report for number of features 5000:
              precision    recall  f1-score   support

       False       0.61      0.61      0.61      1780
        True       0.61      0.61      0.61      1784

    accuracy                           0.61      3564
   macro avg       0.61      0.61      0.61      3564
weighted avg       0.61      0.61      0.61      3564

[[1093  687]
 [ 689 1095]]


In [42]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 2.6 MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [43]:
from textblob import TextBlob
df_sa = df_second_iteration.copy()

pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

df_sa['polarity'] = df_sa['reviewText'].apply(lambda x: pol(x) )
df_sa['subjectivity'] = df_sa['reviewText'].apply(lambda x: sub(x) )

In [44]:
df_sa

Unnamed: 0.1,Unnamed: 0,reviewText,rating,label,nb_adj_adv,nb_words,ratio_adj_adv,doc_length,avg_word_length,polarity,subjectivity
0,0,jenkins history professor member parliament au...,4,True,49,144,0.340278,1245,7.652778,0.121264,0.434615
1,1,didnt read purchased gift family small childre...,3,False,2,10,0.200000,62,5.300000,0.225000,0.500000
2,2,fierce angels sheri park reads like dissertati...,4,True,41,134,0.305970,976,6.291045,0.041026,0.604487
3,3,clearly author goals mind 1 advantage american...,1,True,15,50,0.300000,355,6.120000,-0.100000,0.688095
4,4,collection stories memories japanese soldiers ...,5,True,39,114,0.342105,882,6.745614,0.076190,0.357738
...,...,...,...,...,...,...,...,...,...,...,...
28417,28418,daniel klein coauthor thomas cathcart wonderfu...,4,True,30,113,0.265487,868,6.690265,0.274407,0.467278
28418,28419,golden book 1962 illustrations great text good...,5,True,10,33,0.303030,238,6.242424,0.495000,0.718393
28419,28420,initially attracted book main characters math ...,5,True,31,106,0.292453,807,6.622642,0.215789,0.478446
28420,28421,intriguing mystery compelling romance charlies...,5,True,18,48,0.375000,345,6.208333,0.324242,0.593939


In [54]:
def define_second_iteration_pipeline(number_of_features):
    nb_words_pipe = Pipeline([
        ('selector', NumberSelector("nb_words")),
        ('standard', StandardScaler())
    ])

    nb_adj_adv_pipe = Pipeline([
        ('selector', NumberSelector("nb_adj_adv")),
        ('standard', StandardScaler())
    ])

    ratio_adj_adv_pipe = Pipeline([
        ('selector', NumberSelector("ratio_adj_adv")),
        ('standard', StandardScaler())
    ])

    reviewTestPipeline = Pipeline([
        ('selector', TextSelector("reviewText")),
        ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
        ('feature_selection', SelectKBest(chi2, k=number_of_features))
    ])

    rating = Pipeline([
        ('selector', NumberSelector("rating")),
        ('standard', StandardScaler())
    ])
    
    doc_length_pipe =  Pipeline([
                ('selector', NumberSelector("doc_length")),
                ('standard', StandardScaler())
            ])

    avg_word_length_pipe =  Pipeline([
                ('selector', NumberSelector("avg_word_length")),
                ('standard', StandardScaler())
        ])
    
    polarity_pipe = Pipeline([
        ('selector', NumberSelector("polarity")),
        ('standard', StandardScaler())
        ])

    subjectivity_pipe = Pipeline([
        ('selector', NumberSelector("subjectivity")),
        ('standard', StandardScaler())
        ])


    feats = FeatureUnion([
        ('reviewTestPipeline', reviewTestPipeline),
        ('rating', rating),
        ('ratio_adj_adv', ratio_adj_adv_pipe),
        ('nb_words', nb_words_pipe),
        ('nb_adj_adv', nb_adj_adv_pipe),
        ('doc_length', doc_length_pipe),
        ('avg_word_length', avg_word_length_pipe),
        ('polarity', polarity_pipe),
        ('subjectivity', subjectivity_pipe)
    ])

    second_iteration_pipeline = Pipeline([
        ('features', feats),
        ('classifier', RandomForestClassifier()),
    ])

    return second_iteration_pipeline

In [50]:
df_second_iteration = df_sa
true_samples = df_second_iteration.loc[df_second_iteration["label"] == True]
false_samples = df_second_iteration.loc[df_second_iteration["label"] == False]
false_samples_size = false_samples.shape[0]

true_samples_rebalance = true_samples.sample(n=false_samples_size)

balanced_df = pd.concat([false_samples, true_samples_rebalance])

In [52]:
balanced_df

Unnamed: 0.1,Unnamed: 0,reviewText,rating,label,nb_adj_adv,nb_words,ratio_adj_adv,doc_length,avg_word_length,polarity,subjectivity
1,1,didnt read purchased gift family small childre...,3,False,2,10,0.200000,62,5.300000,0.225000,0.500000
10,10,read thisi gave star wanted attention cos got ...,1,False,12,34,0.352941,204,5.029412,0.371429,0.842857
11,11,wait book triology come interesting type myste...,5,False,5,15,0.333333,92,5.200000,0.250000,0.250000
12,12,bill bryson funny guy complex things language ...,5,False,18,66,0.272727,512,6.772727,0.317500,0.622500
14,14,philip roth authors read matter excellent styl...,5,False,33,125,0.264000,902,6.224000,0.064232,0.642620
...,...,...,...,...,...,...,...,...,...,...,...
11817,11818,admit naving seen sean hannity fox news liked ...,1,True,10,51,0.196078,348,5.843137,0.147619,0.372619
26422,26423,fun read seattle meaningful loved email letter...,5,True,7,15,0.466667,102,5.866667,0.212500,0.733333
11294,11295,diet book fad diet book simply diet try months...,5,True,55,125,0.440000,884,6.080000,0.179309,0.449916
19540,19541,enjoyed humorous insightful book learned new t...,5,True,46,104,0.442308,718,5.913462,0.126124,0.582023


In [55]:
#for number_of_feats in [100, 1000, 10000, 25000, 50000]:
for number_of_feats in [5000]:

    second_iteration_pipeline = define_second_iteration_pipeline(number_of_features=number_of_feats)
    X_train, X_test, y_train, y_test = split_dataset(balanced_df, feature_names=["reviewText", "rating", "nb_adj_adv", "nb_words", "ratio_adj_adv", 'doc_length', 'avg_word_length', "polarity", "subjectivity"], label="label")
    train_pipeline(second_iteration_pipeline, X_train, y_train)
    y_val = get_predictions(second_iteration_pipeline, X_test)
    f1_score_test, class_report_test, confusion_matrix_test = get_scores(y_val, y_test)
    print("Report for number of features " + str(number_of_feats) + ":")
    print(class_report_test)
    print(confusion_matrix_test)

Report for number of features 5000:
              precision    recall  f1-score   support

       False       0.66      0.63      0.64      1856
        True       0.62      0.64      0.63      1708

    accuracy                           0.64      3564
   macro avg       0.64      0.64      0.64      3564
weighted avg       0.64      0.64      0.64      3564

[[1171  685]
 [ 611 1097]]


In [56]:
#for number_of_feats in [100, 1000, 10000, 25000, 50000]:
for number_of_feats in [5000]:

    second_iteration_pipeline = define_second_iteration_pipeline(number_of_features=number_of_feats)
    X_train, X_test, y_train, y_test = split_dataset(df_second_iteration, feature_names=["reviewText", "rating", "nb_adj_adv", "nb_words", "ratio_adj_adv", 'doc_length', 'avg_word_length', "polarity", "subjectivity"], label="label")
    train_pipeline(second_iteration_pipeline, X_train, y_train)
    y_val = get_predictions(second_iteration_pipeline, X_test)
    f1_score_test, class_report_test, confusion_matrix_test = get_scores(y_val, y_test)
    print("Report for number of features " + str(number_of_feats) + ":")
    print(class_report_test)
    print(confusion_matrix_test)

Report for number of features 5000:
              precision    recall  f1-score   support

       False       0.20      0.55      0.29       637
        True       0.93      0.72      0.81      5048

    accuracy                           0.70      5685
   macro avg       0.56      0.63      0.55      5685
weighted avg       0.84      0.70      0.75      5685

[[ 350  287]
 [1432 3616]]
