In [None]:
import re
import os
import pandas as pd
import numpy as np
import string

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings raised by pandas / scikit-learn calls below —
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from nltk import pos_tag
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from sklearn.feature_selection import RFE

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Seed NumPy's global random number generator so a fresh-kernel "Run All"
# starts from the same random state every time.
np.random.seed(0)

In [None]:
# Load the labelled reviews workbook into `df`.
# NOTE(review): the working directory is a hard-coded local Windows path; point it
# at your own data folder (or make it configurable) before running elsewhere.
os.chdir(r'C:\Users\oyo\Desktop')
# `encoding` is not an Excel-reader option (workbooks carry their own encoding) and
# current pandas releases reject the keyword with a TypeError, so it is not passed.
df = pd.read_excel('training_29_6.xlsx')
df.head()

In [None]:
# --- Text-preprocessing resources -------------------------------------------
# English Snowball stemmer, used by preprocessing_text when lemma is False.
stemmer2 = SnowballStemmer('english')

# NLTK's English stop-word list as a set.
stop_words = set(stopwords.words('english'))

# Words that must survive stop-word removal: negations, a few
# intensity/direction words, and contraction stems in three spellings
# (bare stem, with apostrophe, apostrophe dropped).
filter_SW = {
    "very", "until", "out", "than", "against", "off",
    "no", "not", "nor",
    "ain",
    "aren", "aren't", "arent",
    "couldn", "couldn't", "couldnt",
    "didn", "didn't", "didnt",
    "doesn", "doesn't", "doesnt",
    "don", "don't", "dont",
    "hadn", "hadn't", "hadnt",
    "hasn", "hasn't", "hasnt",
    "haven", "haven't",
    "isn", "isn't", "isnt",
    "mightn", "mightn't", "mightnt",
    "mustn", "mustn't", "mustnt",
    "needn", "needn't", "neednt",
    "shan", "shan't", "shant",
    "shouldn", "shouldn't", "shouldnt",
    "wasn", "wasn't", "wasnt",
    "weren", "weren't", "werent",
    "won", "won't", "wont",
    "wouldn", "wouldn't", "wouldnt",
}

# Stop words that are actually removed: the NLTK list minus the keep-list above.
Nstop_words = stop_words.difference(filter_SW)
# All ASCII punctuation characters except the apostrophe, as a single string.
punct = "".join(ch for ch in string.punctuation if ch != "'")
#print(punct)

def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The tag is matched by its leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
def preprocessing_text(text,lemma=True):
    """Normalise a raw review string into a space-separated string of tokens.

    Pipeline:
      1. lower-case the text;
      2. delete apostrophes (straight and curly) and slashes, so contractions
         collapse into a single token ("don't" -> "dont");
      3. replace every remaining run of non-letters with one space;
      4. drop empty tokens and tokens found in ``Nstop_words``;
      5. lemmatize (POS-aware, WordNet) or stem (Snowball) each token;
      6. drop tokens of length 1 and join the rest with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Must be a string; callers in this notebook wrap cell values
        in ``str()`` before calling.
    lemma : bool, default True
        True  -> POS-tag the tokens and lemmatize them with WordNet.
        False -> stem them with the module-level Snowball stemmer ``stemmer2``.

    Returns
    -------
    str
        The processed tokens joined by single spaces (may be the empty string).
    """
    text = text.lower()

    # Remove apostrophes and slashes outright (no space inserted).
    text = text.replace("’","").replace("'","").replace('/','')

    # Every run of non-letter characters (digits, punctuation, newlines, tabs,
    # repeated spaces) becomes exactly one space. After this the text contains
    # only letters and single spaces, so no separate whitespace or punctuation
    # clean-up pass is needed.
    text = re.sub('[^A-Za-z]+', " ", text)

    # Tokenise, dropping empty strings (from leading/trailing spaces) and stop words.
    # NOTE(review): any stop-word entry that contains an apostrophe can never match
    # here because apostrophes were removed above — confirm this is intended.
    tokens = [tok for tok in text.split(" ") if tok and tok not in Nstop_words]

    if lemma:
        # One lemmatizer instance for the whole text instead of one per token.
        lemmatizer = WordNetLemmatizer()
        tokens = [
            lemmatizer.lemmatize(word, get_wordnet_pos(tag))
            for word, tag in pos_tag(tokens)
        ]
    else:
        tokens = [stemmer2.stem(tok) for tok in tokens]

    # Single-character leftovers carry no signal; drop them before joining.
    return " ".join(tok for tok in tokens if len(tok) > 1)

In [None]:
# Stemmed version of every review, in row order; this is the corpus the
# TF-IDF vectorizer is fitted on.
stem1 = [
    preprocessing_text(str(df['Review'][row]), lemma = False)
    for row in range(len(df))
]

In [None]:
# Fit the TF-IDF vocabulary (1- to 4-grams, at most 2000 features) on the
# stemmed corpus and vectorise it.
# scikit-learn documents `stop_words` as 'english', a list, or None, and recent
# releases validate the argument type, so the set is converted to a list
# (sorted only to make the order deterministic; the order has no effect).
t_Vector = TfidfVectorizer(
    max_features=2000,
    min_df=0.001,
    max_df=0.5,
    stop_words=sorted(Nstop_words),
    ngram_range=(1, 4),
)
x = t_Vector.fit_transform(stem1)


In [None]:
# Dimensions of the TF-IDF matrix fitted above: (rows, features).
x.shape
#t_Vector.get_feature_names()


In [None]:
# Encode every L2 tag column as integer class ids, overwriting the column in `df`.
# NOTE(review): the cell mutates `df`, so running it a second time re-encodes the
# already encoded values — re-load `df` before re-running.

# Short key -> L2 column name.
dict_labels = {
    'y1': 'L2 AC/Heater',
    'y2': 'L2 Check-in Experience',
    'y3': 'L2 Comfort & Safety',
    'y4': 'L2 Food Experience',
    'y5': 'L2 Hotel Infrastructure',
    'y6': 'L2 Hygiene & Cleanliness',
    'y7': 'L2 Room Equipment & Amenities',
    'y8': 'L2 Staff & Service',
    'y9': 'L2 TV & WiFi',
    'y10': 'L2 Washroom',
}

# Snapshot of the raw L2 columns, taken before `df` is overwritten below.
df_cols = df[list(dict_labels.values())]

for i, l2_column in dict_labels.items():
    # A fresh encoder per column: class ids are independent between columns.
    encode = LabelEncoder()
    text = df_cols[l2_column]
    # Values are cast to str first, so every distinct value (including the
    # string form of a missing value) gets its own class id.
    df[l2_column] = encode.fit_transform(text.astype(str))
        
    

In [None]:
# For every L1 category: take the reviews flagged with that category, vectorise
# them with the already fitted TF-IDF model, and train/evaluate a classifier
# that predicts the category's (label-encoded) L2 tag.

# Short key -> L1 indicator column (rows with value 1 belong to the category).
dict_labels_L1 = {'y1':'AC/Heater',
'y2':'Check-in Experience',
'y3':'Comfort & Safety',
'y4':'Food Experience',
'y5':'Hotel Infrastructure',
'y6':'Hygiene & Cleanliness',
'y7':'Room Equipment & Amenities',
'y8':'Staff & Service',
'y9':'TV & WiFi',
'y10':'Washroom',
}
# Same short key -> L2 target column for that category.
dict_labels_L2 = {'y1':'L2 AC/Heater',
'y2':'L2 Check-in Experience',
'y3':'L2 Comfort & Safety',
'y4':'L2 Food Experience',
'y5':'L2 Hotel Infrastructure',
'y6':'L2 Hygiene & Cleanliness',
'y7':'L2 Room Equipment & Amenities',
'y8':'L2 Staff & Service',
'y9':'L2 TV & WiFi',
'y10':'L2 Washroom',
}
for i in dict_labels_L1:
    l1_column = dict_labels_L1[i]
    l2_column = dict_labels_L2[i]
    print("-------"+l1_column+"-------")

    # Rows belonging to this L1 category, re-indexed from 0.
    # (A plain boolean filter replaces the former empty-frame + DataFrame.append,
    # which no longer exists in pandas 2.x; the selected rows are the same.)
    df1 = df[df[l1_column] == 1].reset_index()
    y = df1[l2_column]

    # Same stemming as used when the vectorizer was fitted.
    stem = [preprocessing_text(str(review), lemma=False) for review in df1['Review']]

    # NOTE: `x` is rebound here to this category's matrix, replacing the
    # full-corpus matrix created when the vectorizer was fitted.
    x = t_Vector.transform(stem)
    x_train,x_test,y_train,Y_test = train_test_split(x,y,test_size=0.15,random_state = 20)

    # Recursive feature elimination down to 100 features around a random forest.
    # 'sqrt' is what max_features='auto' meant for classifiers ('auto' has been
    # removed from scikit-learn); n_features_to_select must be given by keyword
    # in current scikit-learn.
    # NOTE(review): the forest has no random_state, so results depend on the
    # global NumPy random state at the time this cell runs.
    model = RFE(
        RandomForestClassifier(max_features = 'sqrt', criterion = 'gini',bootstrap = True,class_weight = 'balanced'),
        n_features_to_select=100,
        step=1,
    )
    # Alternative model kept for reference:
    #model = SVC(kernel = 'rbf',class_weight = 'balanced', decision_function_shape = 'ovr', C = 1000000,gamma = 1e-06 )
    model.fit(x_train,y_train)

    # Despite their names these are both *predictions*:
    # y_score -> predicted labels for the test split,
    # y_true  -> predicted labels for the training split.
    y_score = model.predict(x_test)
    y_true = model.predict(x_train)

    # Test-split accuracy and confusion matrix.
    print(accuracy_score(Y_test,y_score))
    print(confusion_matrix(Y_test,y_score))

    # Training-split accuracy and confusion matrix (to gauge over-fitting).
    print(accuracy_score(y_train,y_true))
    print(confusion_matrix(y_train,y_true))

    