# FULL CODE

In [8]:
def load_data():
    """Load the shoe-review workbook and return a cleaned, labelled DataFrame.

    Reads sheet ``Sheet2`` of ``shoes.xlsx`` (relative to the working
    directory), drops the columns that are not used downstream and adds:

    * ``Brand`` -- ``'adidas'``, ``'crocs'`` or ``'skechers'``, chosen by the
      first of those names found in the lower-cased ``product_title``;
      ``'0'`` when none of them occurs.
    * ``verified_purchase`` -- ``'Y'``/``'N'`` mapped to ``1``/``0``.
    * ``positivity`` -- ``2`` when ``star_rating`` > 3, ``0`` when it equals 3,
      otherwise ``1``.
    * ``tokenized_message`` -- list of tokens from the cleaned ``review_body``.
    * ``st_cleaned_message`` -- the tokens that are alphabetic, not stop-words
      and present in the NLTK word list, lemmatized and joined with spaces.

    ``review_body`` itself is overwritten with its cleaned form (lower-cased,
    punctuation and digits removed).

    The NLTK corpora used here (``words``, ``stopwords`` and the WordNet data
    behind the lemmatizer) must already be downloaded.

    Side effect: all Python warnings are silenced for the rest of the session.

    Returns:
        pandas.DataFrame: the processed review table.
    """
    import re
    import string
    import warnings

    import nltk
    import numpy as np
    import pandas as pd

    warnings.filterwarnings("ignore")

    ### reading the data as dataframe
    df = pd.read_excel('shoes.xlsx', sheet_name="Sheet2")

    ### deleting the unwanted columns
    df = df.drop(columns=[
        "marketplace",
        "customer_id",
        "review_id",
        "product_parent",
        "vine",
        "review_headline",
        "total_votes",
        "product_category",
    ])

    #### create a brand subset
    # astype(str) first so a missing or non-text title cannot crash .lower()
    df['product_title'] = df['product_title'].astype(str).str.lower()

    brands = ['adidas', 'crocs', 'skechers']
    conditions = [df['product_title'].str.contains(brand) for brand in brands]
    # The default must be a string: mixing string choices with the implicit
    # integer default 0 is rejected by newer NumPy releases. '0' is the value
    # older releases produced by coercing that integer.
    df['Brand'] = np.select(conditions, brands, default='0')

    #### labeling the data using star rating
    df["verified_purchase"] = df.verified_purchase.map({'Y': 1, 'N': 0})
    df["positivity"] = df["star_rating"].apply(
        lambda x: 2 if x > 3 else (0 if x == 3 else 1)
    )

    ### Text cleaning
    df["review_body"] = df["review_body"].astype("str")
    #nltk.download('words')
    words = set(nltk.corpus.words.words())
    # A set gives constant-time membership tests in clean_word()
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.update(["i've", "i'm", 'on', 'ie', 'thesefor', 'im'])
    wn = nltk.WordNetLemmatizer()

    # Compiled once; every pattern is applied to each review
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    digit_re = re.compile(r'[0-9]')
    leading_space_re = re.compile(r'^\s+')
    token_split_re = re.compile(r'\W+')

    def removing_punc(ele):
        """Lower-case *ele* and strip punctuation, digits and leading blanks."""
        ele = ele.lower()
        ele = punctuation_re.sub('', ele)
        ele = digit_re.sub('', ele)
        # A newline separates words: turn it into a space rather than deleting
        # it, otherwise the words on either side are fused into one token.
        ele = ele.replace('\n', ' ')
        ele = leading_space_re.sub('', ele)
        return ele
    df["review_body"] = df["review_body"].apply(removing_punc)

    def tokenize(txt):
        """Split *txt* into tokens on runs of non-word characters."""
        return token_split_re.split(txt)
    df['tokenized_message'] = df['review_body'].apply(tokenize)

    def clean_word(txt_tokenized):
        """Drop stop-words, non-alphabetic and unknown tokens, then lemmatize."""
        kept = [
            wn.lemmatize(word)
            for word in txt_tokenized
            if word not in stopwords and word.isalpha() and word in words
        ]
        return " ".join(kept)
    df['st_cleaned_message'] = df['tokenized_message'].apply(clean_word)
    return df


def switch_fun(choice, t):
    '''Train and evaluate the classifier selected by the two menu numbers.

    choice -- brand data set: 1 adidas, 2 skechers, 3 crocs
    t      -- classifier: 1 random forest, 2 decision tree,
              3 SGD classifier, 4 multinomial naive Bayes

    For a valid pair the data is loaded, the brand's feature matrix and labels
    are extracted and handed to machine_learning_model() together with a newly
    built classifier. Any other pair is reported through unknown_action().
    Nothing is returned in either case.

    NOTE(review): loss="log" appears to have been renamed "log_loss" in newer
    scikit-learn releases -- confirm against the installed version.
    '''

    from sklearn.svm import SVC
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.linear_model import SGDClassifier

    # menu number -> function that extracts (x, y) for that brand
    brand_table = (
        (1, adidas),
        (2, skechers),
        (3, crocs),
    )
    # menu number -> factory, so only the requested classifier is constructed
    model_table = (
        (1, lambda: RandomForestClassifier(max_depth=10, max_features='sqrt', n_estimators=15)),
        (2, lambda: DecisionTreeClassifier(max_depth=20)),
        (3, lambda: SGDClassifier(loss="log", penalty="l2")),
        (4, lambda: MultinomialNB(alpha=0.2)),
    )

    extract_features = next((fn for key, fn in brand_table if choice == key), None)
    build_model = next((fn for key, fn in model_table if t == key), None)

    # Reject a bad selection before the data is loaded
    if extract_features is None or build_model is None:
        unknown_action()
        return

    df = load_data()
    x1, y1 = extract_features(df)
    machine_learning_model(x1, y1, build_model())
    
def unknown_action():
    '''Tell the user the menu selection was not recognised; return 0.'''
    messages = (
        'invalid entry',
        'Please enter 1 or 2 or 3 for brand reviews',
        'Please enter 1 or 2 or 3 or 4 for machine learning model',
    )
    for message in messages:
        print(message)
    return 0


def adidas(dataset):
    """Return word-count features and sentiment labels for the adidas reviews.

    Rows whose ``Brand`` is ``"adidas"`` are selected and ordered by
    ``review_date``, newest first. The ``st_cleaned_message`` text is turned
    into a dense word-count matrix limited to 6000 vocabulary terms.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the count matrix and ``y`` is the
        ``positivity`` column of the selected rows.
    """
    ### keep only the adidas reviews, most recent first
    is_adidas = dataset["Brand"] == "adidas"
    reviews = dataset[is_adidas].sort_values(by=["review_date"], ascending=False)

    ### drop the columns that are not used for modelling
    reviews = reviews.drop(columns=[
        'product_title',
        'review_body',
        'Brand',
        "review_date",
        'star_rating',
        'tokenized_message',
    ])

    from sklearn.feature_extraction.text import CountVectorizer
    # Original note: "no of feature is 7391" -- presumably the vocabulary size
    # before the max_features cap; confirm.
    vectorizer = CountVectorizer(max_features=6000)

    # word counts per review, as a dense array
    counts = vectorizer.fit_transform(reviews['st_cleaned_message'])
    x = counts.toarray()
    y = reviews['positivity']
    return x, y


def skechers(dataset):
    """Return word-count features and sentiment labels for the skechers reviews.

    Rows whose ``Brand`` is ``"skechers"`` are selected and ordered by
    ``review_date``, newest first. The ``st_cleaned_message`` text is turned
    into a dense word-count matrix limited to 5000 vocabulary terms.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the count matrix and ``y`` is the
        ``positivity`` column of the selected rows.
    """
    ### keep only the skechers reviews, most recent first
    is_skechers = dataset["Brand"] == "skechers"
    reviews = dataset[is_skechers].sort_values(by=["review_date"], ascending=False)

    ### drop the columns that are not used for modelling
    reviews = reviews.drop(columns=[
        'product_title',
        'review_body',
        'Brand',
        "review_date",
        'star_rating',
        'tokenized_message',
    ])

    from sklearn.feature_extraction.text import CountVectorizer
    # Original note: "no of features is 6000" -- it does not match the cap of
    # 5000 used below; presumably the uncapped vocabulary size, confirm.
    vectorizer = CountVectorizer(max_features=5000)

    # word counts per review, as a dense array
    counts = vectorizer.fit_transform(reviews['st_cleaned_message'])
    x = counts.toarray()
    y = reviews['positivity']
    return x, y

def crocs(dataset):
    """Return word-count features and sentiment labels for the crocs reviews.

    Rows whose ``Brand`` is ``"crocs"`` are selected and ordered by
    ``review_date``, newest first. The ``st_cleaned_message`` text is turned
    into a dense word-count matrix limited to 5000 vocabulary terms.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the count matrix and ``y`` is the
        ``positivity`` column of the selected rows.
    """
    ### keep only the crocs reviews, most recent first
    is_crocs = dataset["Brand"] == "crocs"
    reviews = dataset[is_crocs].sort_values(by=["review_date"], ascending=False)

    ### drop the columns that are not used for modelling
    reviews = reviews.drop(columns=[
        'product_title',
        'review_body',
        'Brand',
        "review_date",
        'star_rating',
        'tokenized_message',
    ])

    from sklearn.feature_extraction.text import CountVectorizer
    # Original note: "no of features is 6000" -- it does not match the cap of
    # 5000 used below; presumably the uncapped vocabulary size, confirm.
    vectorizer = CountVectorizer(max_features=5000)

    # word counts per review, as a dense array
    counts = vectorizer.fit_transform(reviews['st_cleaned_message'])
    x = counts.toarray()
    y = reviews['positivity']
    return x, y



    
def machine_learning_model(x1,y1,mla):
    """Train *mla* on a balanced training split and report its accuracy.

    Steps:
      1. Hold out 20% of the data as a test set (``random_state=42``).
      2. Balance the training split with ``SMOTETomek(random_state=0)``.
      3. Wrap *mla* in a pipeline behind a ``TfidfTransformer``.
      4. Score the pipeline with 5-fold stratified cross-validation on the
         balanced training data.
      5. Fit on the balanced training data, then score it on that same data
         and on the untouched test set.

    Shapes, class counts, the pipeline, the three scores and a
    classification report for the test set are printed along the way.

    Args:
        x1: feature matrix (one row per review).
        y1: class labels aligned with *x1* (0 neutral, 1 negative, 2 positive).
        mla: an unfitted scikit-learn style classifier.

    Returns:
        tuple: ``(validation_score, train_score, test_score)`` -- the mean
        cross-validation accuracy, the accuracy on the balanced training data
        and the accuracy on the test set.
    """
    from collections import Counter

    from imblearn.combine import SMOTETomek
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
    from sklearn.pipeline import Pipeline

    # split the dataset as train and test for evaluation
    x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=0.20, random_state = 42)
    print(x_train.shape,y_train.shape)

    # balance the classes of the training split only; the test set is untouched
    print('Before balancing the data')
    print(sorted(Counter(y_train).items()))
    smote_tomek = SMOTETomek(random_state=0)
    x_resampled, y_resampled = smote_tomek.fit_resample(x_train, y_train)
    print('After balancing the data')
    print('class 0 --> neutral\n class 1 --> negative \n class 2 --> positive')
    print(sorted(Counter(y_resampled).items()))

    # TF-IDF weighting followed by the classifier supplied by the caller
    text_clf1 = Pipeline([ ('tfidf', TfidfTransformer()),('clf', mla)])
    print(text_clf1)

    # NOTE(review): the data is resampled before it is split into folds, so
    # synthetic samples may be validated against the rows they were generated
    # from; the cross-validation score may be optimistic. Left unchanged here.
    kfold = StratifiedKFold(n_splits=5)
    scores = cross_val_score(text_clf1, x_resampled, y_resampled, cv=kfold, scoring='accuracy')
    validation_score = scores.mean()
    print('cross validation:',validation_score)

    ### fit the model
    text_clf1.fit(x_resampled,y_resampled)

    ### prediction
    y_pred_train = text_clf1.predict(x_resampled)
    y_pred_test = text_clf1.predict(x_test)

    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
    # the classification report is not: with the arguments swapped, precision
    # and recall trade places.
    train_score = accuracy_score(y_resampled, y_pred_train)
    test_score = accuracy_score(y_test, y_pred_test)
    print('train score:',train_score)
    print('test score:',test_score)
    print('********************************************')
    print(' classification report:\n',classification_report(y_test,y_pred_test,digits=3))

    return validation_score,train_score,test_score

        
def _read_choice(prompt):
    """Prompt for a menu number; return it as an int, or None if it is not one."""
    try:
        return int(input(prompt))
    except (ValueError, EOFError):
        # int() raises ValueError (not TypeError) for non-numeric text.
        # Returning None keeps the name bound so the script can carry on.
        print('invalid entry')
        return None


# Interactive driver: ask for a brand and a classifier, then run the model.
print('Choose brand based dataset to know about Machine Learning model')
print('choose the below options')
print('\n 1.adiads Brand reviews \n 2.skechers Brand reviews \n 3.crocs Brand Reviews')
r = _read_choice("Enter number of your choice : ")

print('\n**************************************************************************')
print('To run the particular classifier model')
print('choose the below options')
print('\n 1.Random Forest classifier \
        \n 2.Decision Tree classifier \
        \n 3.SGD classifier\
        \n 4.Naive Bayes')
m = _read_choice("Enter number of your choice : ")

# A None choice matches no menu number, so switch_fun falls through to its
# invalid-selection branch instead of the script failing with a NameError.
switch_fun(r,m)

Choose brand based dataset to know about Machine Learning model
choose the below options

 1.adiads Brand reviews 
 2.skechers Brand reviews 
 3.crocs Brand Reviews
Enter number of your choice : 3

**************************************************************************
To run the particular classifier model
choose the below options

 1.Random Forest classifier         
 2.Decision Tree classifier         
 3.SGD classifier        
 4.Naive Bayes
Enter number of your choice : 4
(13039, 5000) (13039,)
Before balancing the data
[(0, 1115), (1, 1126), (2, 10798)]
After balancing the data
class 0 --> neutral
 class 1 --> negative 
 class 2 --> positive
[(0, 10795), (1, 10796), (2, 10795)]
Pipeline(steps=[('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=0.2))])
cross validation: 0.6646105298712242
train score: 0.6930463780645958
test score: 0.7309815950920245
********************************************
 classification report:
               precision    recall  