In [9]:
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from collections import Counter 
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [10]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [15]:
p_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer ()

In [16]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)      # replace won't with "will not"
    phrase = re.sub(r"can\'t", "can not", phrase)      # replace can or cant with 'can not'
    phrase = re.sub(r"n\'t", " not", phrase)           # replece n with 'not'
    phrase = re.sub(r"\'re", " are", phrase)           # replace re with 'are'
    phrase = re.sub(r"\'s", " is", phrase)             # replace s with 'is'
    phrase = re.sub(r"\'d", " would", phrase)          # replace 'd' with 'would'
    phrase = re.sub(r"\'ll", " will", phrase)          # replace 'll with 'will'
    phrase = re.sub(r"\'t", " not", phrase)            # replace 't' with 'not'
    phrase = re.sub(r"\'ve", " have", phrase)          # replace ve with 'have'
    phrase = re.sub(r"\'m", " am", phrase)             # replace 'm with 'am'
    return phrase

  
def preprocess_text(text_data):
    preprocessed_text = []             
    # tqdm is for printing the status bar
    for sentance in tqdm(text_data):
        sent = decontracted(sentance)           #calling funcion for each sentence
        #print("1st sent" , sent)
        sent = sent.replace('\\r', ' ')         # replace line terminator with space
        sent = sent.replace('\\n', ' ')         # replace new line charactor with space
        sent = sent.replace('\\"', ' ')         
        sent = re.sub('[^A-Za-z]+', ' ', sent)  # remove anything that is not letter
        sent = ''.join(p_stemmer.stem(token) for token in sent )
        sent = ''.join(lemmatizer.lemmatize(token) for token in sent )
        sent  = ' '.join(e for e in sent.split() if len( Counter(e)) > 2 )
        #sent = lstr(emmatize_text(sent)
        
        sent = ' '.join(e for e in sent.split() if e.lower() not in 'root/nltk_data/corpora/stop_words') # checking for stop words
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text

In [17]:
df = pd.read_csv("../Dataset/test - test.csv")

In [41]:
data =df.copy()

In [83]:
g=preprocess_text([df['text'][5]])

100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1003.18it/s]


In [84]:
g

['love the but missing some features likes keeping backlog tasks']

In [42]:
data['text']=preprocess_text(data['text'])

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 636.99it/s]


In [44]:
X1 = data['text']
X2 = data['aspect']

In [46]:
with open("../Models/best_lrc.pickle", 'rb') as data:
    lrc = pickle.load(data)
with open("../Data Engineering/PickleFiles/tfidf1.pickle", 'rb') as data:
    tfidf1 = pickle.load(data)
with open("../Data Engineering/PickleFiles/tfidf2.pickle", 'rb') as data:
    tfidf2 = pickle.load(data)

In [85]:
features_test_tfidf1 = tfidf1.transform(g).toarray()
#features_test_tfidf2 = tfidf2.transform(X2).toarray()

In [86]:
features_test_tfidf1.shape

(1, 3991)

In [49]:
features_test_tfidf1.shape

(1000, 3991)

In [50]:
X = np.concatenate((features_test_tfidf1, features_test_tfidf2), axis=1)

In [51]:
X.shape

(1000, 4887)

In [52]:
y_label = lrc.predict(X)

In [59]:
data1=df.copy()

In [60]:
data1['label']=y_label

In [61]:
data1.head()

Unnamed: 0,text,aspect,label
0,improve your customer service and product avai...,Customer service,0
1,"functionality is great, almost as in desktop v...",mobile version,0
2,but it keeps starting from zoomed in and then ...,zoomed,0
3,hey marilyn thanks for your answer the soc2 ty...,Security,1
4,@delanovc @zoom @airtable @notionhq @calendly ...,apple,1


In [62]:
data1.to_csv(r'../data/results/test.csv')