# FULL CODE

In [8]:
def load_data():
    ## importing the necessary library 
    import pandas as pd
    import numpy as np
    import datetime
    import seaborn as sns
    import matplotlib.pyplot as plt
    %matplotlib notebook
    import warnings
    warnings.filterwarnings("ignore")

    ### reading the data as dataframe
    df = pd.read_excel('shoes.xlsx',sheet_name="Sheet2")

    ### deleting the unwanted column
    del(df["marketplace"])
    del(df["customer_id"])
    del(df["review_id"])
    del(df["product_parent"])
    del(df["vine"])
    del(df["review_headline"])
    del(df["total_votes"])
    del(df["product_category"])

    #### create a brand subset
    df['product_title']=df['product_title'].apply(lambda x: x.lower())

    conditions =[(df['product_title'].str.contains('adidas')),
                (df['product_title'].str.contains('crocs')),
                (df['product_title'].str.contains('skechers'))]
    values=['adidas','crocs','skechers']
    df['Brand']=np.select(conditions,values)


    #### labeling the data using star rating
    df["verified_purchase"]=df.verified_purchase.map({'Y':1,'N':0})
    df["positivity"] = df["star_rating"].apply(lambda x: 2 if x>3 else(0 if x==3 else 1))

    ### #Text Cleaning
    # 1.1 Define preprocess function
    df["review_body"] = df["review_body"].astype("str")
    import string
    import nltk
    nltk.download('words')
    words = set(nltk.corpus.words.words())
    stopwords = nltk.corpus.stopwords.words('english')
    new_stopwords = ["i've","i'm",'on','ie','thesefor','im']
    stopwords.extend(new_stopwords)
    import re
    wn=nltk.WordNetLemmatizer()


    def removing_punc(ele):
        # Convert the text into lowercase
        ele = ele.lower()
        #punctuation
        ele = re.sub('[%s]' % re.escape(string.punctuation), '', ele)
        # number
        ele = re.sub(r'[0-9]', '', ele)
        #new line
        ele = re.sub('\n', '', ele)
        #white space
        ele= re.sub("^\s+", "", ele)
        return ele
    df["review_body"]=df["review_body"].apply(lambda x: removing_punc(x))


    def tokenize(txt):
        """tokenize each word by using split() function"""
        tokens=re.split('\W+', txt)
        return tokens
    df['tokenized_message']=df['review_body'].apply(lambda x: tokenize(x))

    def clean_word(txt_tokenized):
        """removed the stopword and remove the numbers and get the base word using lemmatize function"""
        new_word = [word for word in txt_tokenized if word not in stopwords]
        new_word = [word for word in new_word if word.isalpha()]
        new_word = [word for word in new_word if word in words]
        new_word = [wn.lemmatize(word) for word in new_word]
        return " ".join(new_word)
    df['st_cleaned_message']=df['tokenized_message'].apply(lambda x:clean_word(x))
    return df


def switch_fun(choice, t):
    ''' based on brand of shoes the corresponding function calls'''
    if choice == 1 and t == 1:
        df=load_data()
        x1,y1=adidas(df)
        rf_trs,rf_ts,rf_vals = random_forest(x1,y1)
    elif choice == 1 and t == 2:
        df=load_data()
        x1,y1=adidas(df)
        lg_trs,lg_ts,lg_vals = logistic_regression(x1,y1)
    elif choice == 1 and t == 3:
        df=load_data()
        x1,y1=adidas(df)
        dt_trs,dt_ts,dt_vals = decision_tree(x1,y1)
    elif choice == 1 and t == 4:
        df=load_data()
        x1,y1=adidas(df)
        nb_trs,nb_ts,nb_vals = nb(x1,y1)
    elif choice == 2 and t == 1:
        df=load_data()
        x1,y1=skechers(df)
        srf_trs,srf_ts,srf_vals = random_forest(x1,y1)
    elif choice == 2 and t == 2:
        df=load_data()
        x1,y1=skechers(df)
        slg_trs,slg_ts,slg_vals = logistic_regression(x1,y1)
    elif choice == 2 and t == 3:
        df=load_data()
        x1,y1=skechers(df)
        sdt_trs,sdt_ts,sdt_vals = decision_tree(x1,y1)
    elif choice == 2 and t == 4:
        df=load_data()
        x1,y1=skechers(df)
        snb_trs,snb_ts,snb_vals = nb(x1,y1)
    elif choice == 3 and t == 1:
        df=load_data()
        x1,y1=crocs(df)
        crf_trs,crf_ts,crf_vals = random_forest(x1,y1)
    elif choice == 3 and t == 2:
        df=load_data()
        x1,y1=crocs(df)
        clg_trs,clg_ts,clg_vals = logistic_regression(x1,y1)
    elif choice == 3 and t == 3:
        df=load_data()
        x1,y1=crocs(df)
        cdt_trs,cdt_ts,cdt_vals = decision_tree(x1,y1)
    elif choice == 3 and t == 4:
        df=load_data()
        x1,y1=crocs(df)
        cnb_trs,cnb_ts,cnb_vals = nb(x1,y1)
    
    else: 
        unknown_action()
    
def unknown_action():
    '''if the user input is wrong'''
    print('invalid entry')
    print('Please enter 1 or 2 or 3 for brand reviews')
    print('Please enter 1 or 2 or 3 or 4 for machine learning model')
    return 0


def adidas(dataset):
    ### extracting only the adidas shoe brand reviews from the dataset
    adidas = dataset[dataset["Brand"]=="adidas"].sort_values(by=["review_date"], ascending=False)
    
    ### Removing unwanted column
    del(adidas['product_title'])
    del(adidas['review_body'])
    del(adidas['Brand'])
    del(adidas["review_date"])
    del(adidas['star_rating'])
    del(adidas['tokenized_message'])

    
    ## define size of vocabulary and max length of the sentence in the review dataset
    vocab_size = 4156
    max_length = 291
    
    ## name the y variable as sentiment
    import numpy as np
    sentiment = np.array(adidas['positivity'])
    sentiment

    from tensorflow.keras.preprocessing.text import one_hot
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    ## one hot encoding for the words
    encoded_reviews = [one_hot(d, vocab_size) for d in adidas['st_cleaned_message']]

    ## padding 
    padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')

    ## define X and y variable
    x=padded_reviews
    y=sentiment
    return x,y


def skechers(dataset):
    ### extracting only the adidas shoe brand reviews from the dataset
    skechers = dataset[dataset["Brand"]=="skechers"].sort_values(by=["review_date"], ascending=False)
    
    ### Removing unwanted column
    del(skechers['product_title'])
    del(skechers['review_body'])
    del(skechers['Brand'])
    del(skechers["review_date"])
    del(skechers['star_rating'])
    del(skechers['tokenized_message'])

    
    ## define size of vocabulary and max length of the sentence in the review dataset
    vocab_size = 4800
    max_length = 330
    
    ## name the y variable as sentiment
    import numpy as np
    sentiment = np.array(skechers['positivity'])
    sentiment

    from tensorflow.keras.preprocessing.text import one_hot
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    ## one hot encoding for the words
    encoded_reviews = [one_hot(d, vocab_size) for d in skechers['st_cleaned_message']]

    ## padding 
    padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')

    ## define X and y variable
    x=padded_reviews
    y=sentiment
    return x,y

def crocs(dataset):
    
    ### extracting only the adidas shoe brand reviews from the dataset
    crocs = dataset[dataset["Brand"]=="crocs"].sort_values(by=["review_date"], ascending=False)
    
    ### Removing unwanted column
    del(crocs['product_title'])
    del(crocs['review_body'])
    del(crocs['Brand'])
    del(crocs["review_date"])
    del(crocs['star_rating'])
    del(crocs['tokenized_message'])

    
    ## define size of vocabulary and max length of the sentence in the review dataset
    vocab_size = 5032
    max_length = 235
    
    ## name the y variable as sentiment
    import numpy as np
    sentiment = np.array(crocs['positivity'])
    sentiment

    from tensorflow.keras.preprocessing.text import one_hot
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    ## one hot encoding for the words
    encoded_reviews = [one_hot(d, vocab_size) for d in crocs['st_cleaned_message']]

    ## padding 
    padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')

    ## define X and y variable
    x=padded_reviews
    y=sentiment
    return x,y



    
def random_forest(x1,y1):
    
    # split the dataset as train and test for evalution
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=0.20, random_state = 0)
    
    from collections import Counter
    print('Before balancing the data')
    print(sorted(Counter(y_train).items()))
    from imblearn.combine import SMOTETomek
    smote_tomek = SMOTETomek(random_state=0)
    x_resampled, y_resampled = smote_tomek.fit_resample(x_train, y_train)
    print('After balancing the data')
    print('class 0 --> neutral\n class 1 --> negative \n class 2 --> positive')
    print(sorted(Counter(y_resampled).items()))

    
    # RANDOM FOREST CLASSIFIER
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_val_score,KFold,cross_validate,GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report

    random_grid = {'max_depth': [10, 20],
                 'max_features': ['auto', 'sqrt'],
                 'n_estimators': [ 15, 10]}

    randomforest = GridSearchCV(RandomForestClassifier(),random_grid,cv=5)
    randomforest_fit = randomforest.fit(x_resampled,y_resampled)


    ##### best estimator
    best_rf_model = randomforest_fit.best_estimator_
    print('the best hyperparameters:',best_rf_model)

    ##### cross validation score
    kfold = StratifiedKFold(n_splits=5)
    scores = cross_val_score(best_rf_model, x_resampled, y_resampled, cv=kfold, scoring='accuracy')
    rf_validation_score = scores.mean()

    ### fit the model
    from sklearn.metrics import accuracy_score
    best_rf_model.fit(x_resampled,y_resampled)

    ### predict the model
    y_pred_train = best_rf_model.predict(x_resampled)
    y_pred_test = best_rf_model.predict(x_test)
    rf_train_score = accuracy_score(y_pred_train,y_resampled)
    rf_test_score = accuracy_score(y_pred_test,y_test)

    print('train accuracy:',rf_train_score)
    print('test accuracy:',rf_test_score)
    print('validation score:',rf_validation_score)
    print('********************************************')
    print(' classification report:\n',classification_report(y_pred_test,y_test,digits=3))
    return rf_train_score,rf_test_score,rf_validation_score

def logistic_regression(x1,y1):
     # split the dataset as train and test for evalution
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=0.20, random_state = 0)
    
    from collections import Counter
    print('Before balancing the data')
    print(sorted(Counter(y_train).items()))
    from imblearn.combine import SMOTETomek
    smote_tomek = SMOTETomek(random_state=0)
    x_resampled, y_resampled = smote_tomek.fit_resample(x_train, y_train)
    print('After balancing the data')
    print('class 0 --> neutral\n class 1 --> negative \n class 2 --> positive')
    print(sorted(Counter(y_resampled).items()))

    ### LOGISTIC REGRESSION CLASSIFIER
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_val_score,KFold,cross_validate,GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report

    ##### hyperparameter tuning
    param_grid = {'C' : [0.1, 0.2, 0.3, 0.4, 0.5]}
    logistic = GridSearchCV(LogisticRegression(),param_grid,cv=5)
    logistic_fit = logistic.fit(x_resampled,y_resampled)


    ##### best estimator
    best_lg_model = logistic_fit.best_estimator_
    print('the best hyperparameters:',best_lg_model)

    #### cross validation score
    kfold = StratifiedKFold(n_splits=5)
    scores = cross_val_score(best_lg_model, x_resampled, y_resampled, cv=kfold, scoring='accuracy')
    lg_validation_score = scores.mean()

    ### fit the model
    from sklearn.metrics import accuracy_score
    best_lg_model.fit(x_resampled,y_resampled)

    ### predict the model
    y_pred_train = best_lg_model.predict(x_resampled)
    y_pred_test = best_lg_model.predict(x_test)
    lg_train_score = accuracy_score(y_pred_train,y_resampled)
    lg_test_score = accuracy_score(y_pred_test,y_test)

    print('train accuracy:',lg_train_score)
    print('test accuracy:',lg_test_score)
    print('validation score:',lg_validation_score)
    print('********************************************')
    print(' classification report:\n',classification_report(y_pred_test,y_test,digits=3))
    return lg_train_score,lg_test_score,lg_validation_score

def decision_tree(x1,y1):
     # split the dataset as train and test for evalution
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=0.20, random_state = 0) 
    
    from collections import Counter
    print('Before balancing the data')
    print(sorted(Counter(y_train).items()))
    from imblearn.combine import SMOTETomek
    smote_tomek = SMOTETomek(random_state=0)
    x_resampled, y_resampled = smote_tomek.fit_resample(x_train, y_train)
    print('After balancing the data')
    print('class 0 --> neutral\n class 1 --> negative \n class 2 --> positive')
    print(sorted(Counter(y_resampled).items()))
    
    
    ## DECISION TREE CLASSIFIER
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_val_score,KFold,cross_val_predict,cross_validate,GridSearchCV
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import classification_report

    ##### hyperparameter tuning
    tree_para = {'criterion':['gini','entropy'],'max_depth':[1,2,3,4,5,20,30,40,50,70,90,120,150]}
    decision_tree = GridSearchCV(DecisionTreeClassifier(), tree_para,scoring='accuracy')
    decision_tree_fit =decision_tree.fit(x_resampled,y_resampled)

    ##### best estimator
    best_dt_model = decision_tree_fit.best_estimator_
    print('the best hyperparameters:',best_dt_model)

    #### cross validation score
    kfold = StratifiedKFold(n_splits=5)
    scores = cross_val_score(best_dt_model, x_resampled, y_resampled, cv=kfold, scoring='accuracy')
    dt_validation_score = scores.mean()

    ### fit the model
    from sklearn.metrics import accuracy_score
    best_dt_model.fit(x_resampled,y_resampled)

    ### predict the model
    y_pred_train = best_dt_model.predict(x_resampled)
    y_pred_test = best_dt_model.predict(x_test)
    dt_train_score = accuracy_score(y_pred_train,y_resampled)
    dt_test_score = accuracy_score(y_pred_test,y_test)

    print('train accuracy:',dt_train_score)
    print('test accuracy:',dt_test_score)
    print('validation score:',dt_validation_score)
    print('********************************************')
    print(' classification report:\n',classification_report(y_pred_test,y_test,digits=3))
    return dt_train_score,dt_test_score,dt_validation_score

def nb(x1,y1):
     # split the dataset as train and test for evalution
    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=0.20, random_state = 0)
    
    from collections import Counter
    print('Before balancing the data')
    print(sorted(Counter(y_train).items()))
    from imblearn.combine import SMOTETomek
    smote_tomek = SMOTETomek(random_state=0)
    x_resampled, y_resampled = smote_tomek.fit_resample(x_train, y_train)
    print('After balancing the data')
    print('class 0 --> neutral\n class 1 --> negative \n class 2 --> positive')
    print(sorted(Counter(y_resampled).items()))
    
    ###NAIVE BAYES
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_val_score,KFold,cross_validate,GridSearchCV
    from sklearn.naive_bayes import GaussianNB,MultinomialNB
    from sklearn.metrics import classification_report
    import numpy as np

    params_NB = {'alpha': (0.1,0.2)}

    nb = GridSearchCV(MultinomialNB(),params_NB,scoring='accuracy')
    nb_fit = nb.fit(x_resampled,y_resampled)

    ##### best estimator
    best_nb_model = nb_fit.best_estimator_
    print('the best hyperparameters:',best_nb_model)

    ### cross validation score
    kfold = StratifiedKFold(n_splits=5)
    scores = cross_val_score(best_nb_model, x_resampled, y_resampled, cv=kfold, scoring='accuracy')
    nb_validation_score = scores.mean()

    ### fit the model
    from sklearn.metrics import accuracy_score
    best_nb_model.fit(x_resampled,y_resampled)

    ### predict the model
    y_pred_train = best_nb_model.predict(x_resampled)
    y_pred_test = best_nb_model.predict(x_test)
    nb_train_score = accuracy_score(y_pred_train,y_resampled)
    nb_test_score = accuracy_score(y_pred_test,y_test)

    print('train accuracy:',nb_train_score)
    print('test accuracy:',nb_test_score)
    print('validation score:',nb_validation_score)
    print('********************************************')
    print(' classification report:\n',classification_report(y_pred_test,y_test,digits=3))
    return nb_train_score,nb_test_score,nb_validation_score
    
        
print('Choose brand based dataset to know about Machine Learning model')
print('choose the below options')
print('\n 1.adiads Brand reviews \n 2.skechers Brand reviews \n 3.crocs Brand Reviews')
try:
    r = int(input("Enter number of your choice : ")) 
    
except TypeError:
        print("TypeError")
except:
    print('invalid entry')
    
print('\n**************************************************************************')
print('To run the particular classifier model')
print('choose the below options')
print('\n 1.Random Forest classifier \
        \n 2.Logistic Regression classifier \
        \n 3.Decision tree classifier \
        \n 4.Naive Bayes')
try:
    m = int(input("Enter number of your choice : ")) 
    
except TypeError:
        print("TypeError")
except:
    print('invalid entry')
switch_fun(r,m)        

Choose brand based dataset to know about Machine Learning model
choose the below options

 1.adiads Brand reviews 
 2.skechers Brand reviews 
 3.crocs Brand Reviews
Enter number of your choice : 3

**************************************************************************
To run the particular classifier model
choose the below options

 1.Random Forest classifier         
 2.Logistic Regression classifier         
 3.Decision tree classifier         
 4.Naive Bayes
Enter number of your choice : 4


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ndhiv\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Before balancing the data
[(0, 1129), (1, 1154), (2, 10756)]
After balancing the data
class 0 --> neutral
 class 1 --> negative 
 class 2 --> positive
[(0, 10603), (1, 10613), (2, 10560)]
the best hyperparameters: MultinomialNB(alpha=0.1)
train accuracy: 0.3725767875125881
test accuracy: 0.6843558282208589
validation score: 0.3728285335006256
********************************************
 classification report:
               precision    recall  f1-score   support

           0      0.109     0.087     0.097       344
           1      0.114     0.085     0.097       366
           2      0.800     0.851     0.824      2550

    accuracy                          0.684      3260
   macro avg      0.341     0.341     0.340      3260
weighted avg      0.650     0.684     0.666      3260

