<a href="https://colab.research.google.com/github/ftxsilva/Data-Challenge-CAPGEMINI/blob/master/Lemmatize_Filter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hackathon Supaero - Restaurants ratings prediction

Predict Yelp restaurant ratings from the associated reviews and from data about users and restaurants.

# 0. Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
!pip install lime
from lime.lime_text import LimeTextExplainer
import string
import seaborn as sns
import matplotlib.pyplot as plt



# 1. Data

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive so the CSV files below can be read
# by path; force_remount=True asks Colab to redo the mount if one already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load the training and test sets from the mounted Drive folder.
df = pd.read_csv("/content/drive/My Drive/Hackathon-Capgemini/data/train_set.csv")
df_test = pd.read_csv("/content/drive/My Drive/Hackathon-Capgemini/data/test_set.csv")

In [None]:
df = df.dropna(subset=["review"]) # keep only rows that have review text; the model below is trained on that text

In [None]:
# Spread of the 'useful_user' column: lower quartile, median, then mean
# (one value per output line).
print(
    df['useful_user'].quantile(0.25),
    df['useful_user'].quantile(0.5),
    df['useful_user'].mean(),
    sep='\n',
)

6.0
29.0
552.1535579431832


In [None]:
# Spread of the 'useful_review' column: lower quartile, median, then mean
# (one value per output line).
print(
    df['useful_review'].quantile(0.25),
    df['useful_review'].quantile(0.5),
    df['useful_review'].mean(),
    sep='\n',
)

0.0
0.0
1.1570299965968205


# 2. Data Split

In [None]:
# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# class proportions equal in both parts; the fixed seed makes the split repeatable.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=0, stratify=df['binary_target'])

# 3. Cleaning

In [None]:
import nltk
# Download NLTK's "popular" data collection.
# NOTE(review): presumably this is what provides the stopword list and WordNet
# data used by the cleaning functions below — confirm; if so, downloading only
# those packages would be enough.
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

In [None]:
from string import punctuation

from nltk.corpus import stopwords

from nltk.tokenize import RegexpTokenizer
from nltk import wordpunct_tokenize

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# English stopwords held in a set, so the per-token membership test in
# remove_stopwords is a constant-time lookup.
stopw = set(stopwords.words('english'))

# Lowercase
def convert_text_to_lowercase(df, colname):
    """Return a copy of ``df`` in which the text column ``colname`` is lowercased.

    Args:
        df: DataFrame containing the text column.
        colname: label of the column to lowercase (handled through the pandas
            ``.str`` accessor).

    Returns:
        A new DataFrame. ``df`` itself is not modified, so passing a slice of
        another frame no longer raises ``SettingWithCopyWarning``.
    """
    result = df.copy()
    result[colname] = result[colname].str.lower()
    return result
    
# Remove punctuation
def remove_punctuation(df, colname):
    """Return a copy of ``df`` with ASCII punctuation removed from ``colname``.

    Every character listed in ``string.punctuation`` is deleted; all other
    characters, including whitespace, are kept as they are.

    Args:
        df: DataFrame containing the text column.
        colname: label of a column whose values are strings.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    # A single translation table lets str.translate drop all punctuation in one
    # pass per review instead of testing every character in a Python loop.
    strip_table = str.maketrans('', '', punctuation)

    result = df.copy()
    result[colname] = result[colname].apply(lambda review: review.translate(strip_table))
    return result

# Tokenize
def tokenize(df, colname, tokenizer=None):
    """Return a copy of ``df`` in which each text of ``colname`` is a token list.

    Args:
        df: DataFrame containing the text column.
        colname: label of a column whose values are strings.
        tokenizer: optional object exposing ``tokenize(text) -> list``. When
            omitted, an NLTK ``RegexpTokenizer(r'\\w+')`` is used, i.e. tokens
            are the runs of word characters.

    Returns:
        A new DataFrame whose ``colname`` values are lists of tokens; ``df``
        itself is not modified.
    """
    if tokenizer is None:
        tokenizer = RegexpTokenizer(r'\w+')

    result = df.copy()
    result[colname] = result[colname].apply(tokenizer.tokenize)
    return result

# Remove stopwords - low predictive power
def remove_stopwords(df, colname, stop_words=None):
    """Return a copy of ``df`` with stopwords removed from each token list.

    Args:
        df: DataFrame containing the tokenized column.
        colname: label of a column whose values are iterables of tokens.
        stop_words: optional iterable of tokens to drop. When omitted, the
            module-level English stopword set ``stopw`` is used.

    Returns:
        A new DataFrame whose ``colname`` values are lists holding the
        remaining tokens in their original order; ``df`` itself is not
        modified.
    """
    # Normalise a caller-supplied collection to a set for constant-time lookups.
    blocked = stopw if stop_words is None else frozenset(stop_words)

    result = df.copy()
    result[colname] = result[colname].apply(
        lambda tokens: [w for w in tokens if w not in blocked]
    )
    return result

# Remove 1-character words
def remove_1char_words(df, colname):
    """Return a copy of ``df`` without 1-character tokens in ``colname``.

    Note that this step also changes the column's value type: it receives a
    sequence of tokens per row and produces a single space-separated string,
    which is the form the later string-based steps split on.

    Args:
        df: DataFrame containing the tokenized column.
        colname: label of a column whose values are iterables of tokens.

    Returns:
        A new DataFrame whose ``colname`` values are strings made of the tokens
        longer than one character, joined by single spaces; ``df`` itself is
        not modified.
    """
    result = df.copy()
    result[colname] = result[colname].apply(
        lambda tokens: ' '.join(w for w in tokens if len(w) > 1)
    )
    return result

# Remove non-alpha
def remove_non_alpha_words(df, colname):
    """Return a copy of ``df`` keeping only purely alphabetic words in ``colname``.

    Each text is split on whitespace; words for which ``str.isalpha()`` is
    false (digits, mixed alphanumerics, ...) are dropped and the rest are
    re-joined with single spaces.

    Args:
        df: DataFrame containing the text column.
        colname: label of a column whose values are strings.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    result = df.copy()
    result[colname] = result[colname].apply(
        lambda text: ' '.join(w for w in text.split() if w.isalpha())
    )
    return result

# Lemmatizing
def lemmatize(df, colname, lemmatizer=None):
    """Return a copy of ``df`` with every word of ``colname`` lemmatized.

    Each text is split on whitespace, every word is passed to
    ``lemmatizer.lemmatize`` and the results are re-joined with single spaces.

    Args:
        df: DataFrame containing the text column.
        colname: label of a column whose values are strings.
        lemmatizer: optional object exposing ``lemmatize(word) -> str``. When
            omitted, a fresh NLTK ``WordNetLemmatizer`` is created.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    # NOTE(review): lemmatize() is called without a part-of-speech argument, so
    # the lemmatizer's default is used for every word — confirm that is intended.
    to_lemma = lemmatizer.lemmatize

    result = df.copy()
    result[colname] = result[colname].apply(
        lambda text: ' '.join(to_lemma(w) for w in text.split())
    )
    return result

# OR Stemming
def stem(df, colname, stemmer=None):
    """Return a copy of ``df`` with every word of ``colname`` stemmed.

    Each text is split on whitespace, every word is passed to ``stemmer.stem``
    and the results are re-joined with single spaces.

    Args:
        df: DataFrame containing the text column.
        colname: label of a column whose values are strings.
        stemmer: optional object exposing ``stem(word) -> str``. When omitted,
            a fresh NLTK ``PorterStemmer`` is created.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    to_stem = stemmer.stem

    result = df.copy()
    result[colname] = result[colname].apply(
        lambda text: ' '.join(to_stem(w) for w in text.split())
    )
    return result


def text_cleaning(df, colname, normalizer='lemmatize', drop_non_alpha=False):
    """Run the review-cleaning pipeline on one text column of a DataFrame.

    Steps, in order:
    1. convert text to lowercase
    2. remove punctuation
    3. tokenize into words
    4. remove English stopwords
    5. remove 1-character words (tokens are re-joined into one string here)
    6. optionally remove words that are not purely alphabetic
    7. normalize each word by lemmatizing (default) or stemming

    Args:
        df: DataFrame containing the text column.
        colname: label of the column holding the raw review strings.
        normalizer: ``'lemmatize'`` (default), ``'stem'``, or ``None`` to skip
            word normalization.
        drop_non_alpha: when true, step 6 is applied; off by default.

    Returns:
        A new DataFrame whose ``colname`` values are cleaned, space-separated
        strings. ``df`` itself is not modified.

    Raises:
        ValueError: if ``normalizer`` is not one of the supported values.
    """
    # Validate first so a typo fails immediately instead of after the
    # (potentially long) earlier cleaning steps.
    if normalizer == 'lemmatize':
        normalize = lemmatize
    elif normalizer == 'stem':
        normalize = stem
    elif normalizer is None:
        normalize = None
    else:
        raise ValueError(
            "normalizer must be 'lemmatize', 'stem' or None, got %r" % (normalizer,)
        )

    # Work on a private copy: the caller's frame (often a slice of a larger
    # one) is left untouched and no SettingWithCopyWarning is raised.
    cleaned = (
        df.copy()
        .pipe(convert_text_to_lowercase, colname)
        .pipe(remove_punctuation, colname)
        .pipe(tokenize, colname)
        .pipe(remove_stopwords, colname)
        .pipe(remove_1char_words, colname)
    )
    if drop_non_alpha:
        cleaned = cleaned.pipe(remove_non_alpha_words, colname)
    if normalize is not None:
        cleaned = cleaned.pipe(normalize, colname)
    return cleaned

In [None]:

# No filter: keep every training row, restricted to the review text and target.
# .copy() makes this an independent frame rather than a slice of df_train, so
# column assignments made during cleaning do not raise SettingWithCopyWarning.
df_train_filter = df_train[['review', 'binary_target']].copy()

'''
# Filter "on Strip" - not good

df_train['address'].fillna('', inplace=True)
df_train['in_Strip'] = df_train['address'].str.contains('Las Vegas Blvd').astype(int)

df_train_filter = df_train[df_train['in_Strip'] == 1]


# Filter on useful review
df_train_filter = df_train[df_train['useful_review'] >= 1]
'''

'\n# Filter "on Strip" - not good\n\ndf_train[\'address\'].fillna(\'\', inplace=True)\ndf_train[\'in_Strip\'] = df_train[\'address\'].str.contains(\'Las Vegas Blvd\').astype(int)\n\ndf_train_filter = df_train[df_train[\'in_Strip\'] == 1]\n\n\n# Filter on useful review\ndf_train_filter = df_train[df_train[\'useful_review\'] >= 1]\n'

In [None]:
# Apply the same cleaning pipeline to the training and validation reviews so
# both are vectorized from identically prepared text.
df_train_clean = text_cleaning(df_train_filter, 'review')
df_val_clean = text_cleaning(df_val, 'review')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pand

# 4. Feature Engineering
## Be creative !

# 5. Model

## Vectorization & model initiation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two candidate vectorizers; only the TF-IDF one is wired into the pipeline.
Count_Vectorizer = CountVectorizer(max_features=20000)
tfidf = TfidfVectorizer(ngram_range=(1, 2))  # unigrams and bigrams

# NOTE(review): the recorded fit output ends with "TOTAL NO. of ITERATIONS
# REACHED LIMIT", i.e. the solver stopped at max_iter=500 without converging.
logit = LogisticRegression(
    C=10,
    class_weight='balanced',
    max_iter=500,
    random_state=0,
)

pipeline = Pipeline(steps=[
    # ('vectorizer', Count_Vectorizer),  # alternative: raw term counts
    ('vectorizer', tfidf),
    ('model', logit),
])

## Model fit on train set

In [None]:
# Model input is the cleaned review text; the label is the binary target.
x_train, y_train = df_train_clean['review'], df_train_clean['binary_target']
x_val, y_val = df_val_clean['review'], df_val_clean['binary_target']

In [None]:
# Disabled experiment, kept inside a string literal so that it is not executed:
# grid search over the TF-IDF n-gram range and document-frequency cut-offs.
# NOTE(review): if this is re-enabled, the final evaluation calls
# pipeline.predict, i.e. the pipeline fitted on its original settings, not the
# grid-search winner (grid_search.best_estimator_) — confirm which is intended.
'''
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from time import time
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer

X = x_train
y = y_train
pipeline.fit(X, y)

# frequency for document (whole review)
parameters = {
   'vectorizer__ngram_range': [(1, 2), (1, 3)],
   'vectorizer__max_df': [1.0, 0.9, 0.8],
   'vectorizer__min_df': [0.0, 0.05, 0.1],
}

grid_search = GridSearchCV(pipeline, parameters, 
                        cv=list(KFold(n_splits=2, shuffle=True).split(X)),
                        verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X, y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_

y_pred = pipeline.predict(x_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))
'''

'\nfrom sklearn.model_selection import GridSearchCV\nfrom pprint import pprint\nfrom time import time\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.model_selection import KFold\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\nX = x_train\ny = y_train\npipeline.fit(X, y)\n\n# frequency for document (whole review)\nparameters = {\n   \'vectorizer__ngram_range\': [(1, 2), (1, 3)],\n   \'vectorizer__max_df\': [1.0, 0.9, 0.8],\n   \'vectorizer__min_df\': [0.0, 0.05, 0.1],\n}\n\ngrid_search = GridSearchCV(pipeline, parameters, \n                        cv=list(KFold(n_splits=2, shuffle=True).split(X)),\n                        verbose=1)\n\nprint("Performing grid search...")\nprint("pipeline:", [name for name, _ in pipeline.steps])\nprint("parameters:")\npprint(parameters)\nt0 = time()\ngrid_search.fit(X, y)\nprint("done in %0.3fs" % (time() - t0))\nprint()\n\nprint("Best score: %0.3f" % grid_search.best_score_)\nprint("Best parameters set:")\nbest_parameters =

In [None]:
# Fit the vectorizer and the classifier together on the cleaned training reviews.
pipeline.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 LogisticRegression(C=10, class_weight='balanced', dual=False,
                                    fit_intercept=True, inter

# 6. Evaluation

In [None]:
# Predict the binary target for the held-out validation reviews.
y_pred = pipeline.predict(x_val)

In [None]:
# Confusion matrix first, then per-class precision / recall / F1 on validation.
print(
    confusion_matrix(y_val, y_pred),
    classification_report(y_val, y_pred),
    sep='\n',
)

[[ 72818   9919]
 [ 11222 152872]]
              precision    recall  f1-score   support

           0       0.87      0.88      0.87     82737
           1       0.94      0.93      0.94    164094

    accuracy                           0.91    246831
   macro avg       0.90      0.91      0.90    246831
weighted avg       0.91      0.91      0.91    246831



# 7. Prediction on test set

In [None]:
# Unlike the training data, the test set is never filtered for missing reviews,
# and the submission pairs every review_id with a prediction, so rows cannot be
# dropped. A missing review is replaced by an empty string, which the cleaning
# steps can process. assign() builds a new frame, leaving df_test as loaded.
df_test_cleaned = text_cleaning(
    df_test.assign(review=df_test['review'].fillna('')),
    'review',
)
x_test = df_test_cleaned['review']

predictions = pipeline.predict(x_test)

# 8. Formatting & export to csv

In [None]:
# Build the submission table: one boolean prediction per test review id,
# with the id stored as a string.
soumission = pd.DataFrame(
    {"review_id": df_test['review_id'], "prediction": predictions}
).astype({"prediction": 'bool', "review_id": 'str'})

soumission.to_csv('/content/drive/My Drive/Hackathon-Capgemini/submissions/submission_erika_3.csv', index=False)

In [None]:
# Disabled Kaggle CLI setup, kept inside a string literal so that it is not
# executed: installs the client, uploads kaggle.json and copies it to ~/.kaggle.
'''
!pip install -q kaggle
!mkdir -p ~/.kaggle
from google.colab import files
files.upload() # Upload the file from your computer here 
               # (you have to download it at https://www.kaggle.com/<Your_Account>/account at the API Section)
!cp kaggle.json ~/.kaggle/
#!kaggle competitions submit -c restaurants-ratings-prediction -f [FILE PATH] -m "[CUSTOM MESSAGE]"
'''

'\n!pip install -q kaggle\n!mkdir -p ~/.kaggle\nfrom google.colab import files\nfiles.upload() # Upload the file from your computer here \n               # (you have to download it at https://www.kaggle.com/<Your_Account>/account at the API Section)\n!cp kaggle.json ~/.kaggle/\n#!kaggle competitions submit -c restaurants-ratings-prediction -f [FILE PATH] -m "[CUSTOM MESSAGE]"\n'

In [None]:
#!kaggle competitions submit -c restaurants-ratings-prediction -f /content/drive/My\ Drive/Hackathon-Capgemini/submissions/submission_erika_3.csv -m "Test submission"

# 9. Model Interpretability - Do not consider for kaggle competition

In [None]:
# Disabled LIME helper, kept inside a string literal so that it is not executed:
# explains the prediction for the idx-th validation review with n_features
# features and prints the predicted probability next to the true class.
'''
class_names = [0, 1]
explainer = LimeTextExplainer(class_names=class_names)

def lime_model_interpreter(clf, idx, n_features):
    text_idx = x_val.iloc[idx]
    target_idx = y_val.iloc[idx]

    exp = explainer.explain_instance(text_idx, clf.predict_proba, num_features=n_features)
    print('Document id: %d' % idx)
    print('Probability(True) =', clf.predict_proba([text_idx])[0,1])
    print('True class: %s' % class_names[target_idx])

    exp.show_in_notebook(text=True)
'''    

"\nclass_names = [0, 1]\nexplainer = LimeTextExplainer(class_names=class_names)\n\ndef lime_model_interpreter(clf, idx, n_features):\n    text_idx = x_val.iloc[idx]\n    target_idx = y_val.iloc[idx]\n\n    exp = explainer.explain_instance(text_idx, clf.predict_proba, num_features=n_features)\n    print('Document id: %d' % idx)\n    print('Probability(True) =', clf.predict_proba([text_idx])[0,1])\n    print('True class: %s' % class_names[target_idx])\n\n    exp.show_in_notebook(text=True)\n"

In [None]:
#lime_model_interpreter(pipeline, 2, n_features=6) 

In [None]:
#lime_model_interpreter(pipeline, 7, n_features=6) 