In [1]:
#Importing necessary libraries 
import pandas as pd
import re
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from collections import Counter 
from tqdm import tqdm
from nltk.corpus import stopwords

In [2]:
import time

In [3]:
df = pd.read_csv("../Dataset/train - train.csv")

In [4]:
df.head()

Unnamed: 0,text,aspect,label
0,can you check whether its cancelled completely?,cancelled,1
1,cannot rely on both milk delivery and grocery ...,Milk,0
2,"I get no notification, however the app is real...",notification,0
3,"Love this app, but would love it even more if ...",view,1
4,it does not let me load a clip on the scene,load,0


In [5]:
data =df.copy()

In [6]:
data.tail()

Unnamed: 0,text,aspect,label
3995,every time i try to edit a page or create a li...,tools,0
3996,unable to load money using wallets (phonepe/ol...,Unable to load money using wallets,0
3997,"hi, i m doing for the first time ever and i no...",usage,0
3998,delivery is delayed or cancelled every time i ...,Delivery,0
3999,your customer service is terrible!,customer service,0


In [7]:
#Function for cleaning the data
wnl=WordNetLemmatizer()
def clean_data(in_data):
    corpus=[]
    for i in range(0,len(in_data)):
        review = re.sub('[^a-zA-Z]',' ',in_data[i])
        review = review.lower()
        review = review.split()
        review = [wnl.lemmatize(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [8]:
start_time=time.time()
text_parsed_list = clean_data(data['text'])
print("----%.2f seconds----",(time.time()-start_time))

----%.2f seconds---- 25.69522523880005


In [9]:
text_parsed_list[0]

'check whether cancelled completely'

In [10]:
start_time=time.time()
aspect_parsed_list = clean_data(data['aspect'])
print("----%.2f seconds----",(time.time()-start_time))

----%.2f seconds---- 1.8804693222045898


In [11]:
aspect_parsed_list[0]

'cancelled'

In [12]:
data['text_parsed']=text_parsed_list
data['aspect_parsed']=aspect_parsed_list

In [13]:
data.head()

Unnamed: 0,text,aspect,label,text_parsed,aspect_parsed
0,can you check whether its cancelled completely?,cancelled,1,check whether cancelled completely,cancelled
1,cannot rely on both milk delivery and grocery ...,Milk,0,cannot rely milk delivery grocery item,milk
2,"I get no notification, however the app is real...",notification,0,get notification however app really fine,notification
3,"Love this app, but would love it even more if ...",view,1,love app would love even gantt chart calendar ...,view
4,it does not let me load a clip on the scene,load,0,let load clip scene,load


In [14]:
data.isnull().sum()

text             0
aspect           0
label            0
text_parsed      0
aspect_parsed    0
dtype: int64

In [15]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [27]:
p_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer ()

In [28]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)      # replace won't with "will not"
    phrase = re.sub(r"can\'t", "can not", phrase)      # replace can or cant with 'can not'
    phrase = re.sub(r"n\'t", " not", phrase)           # replece n with 'not'
    phrase = re.sub(r"\'re", " are", phrase)           # replace re with 'are'
    phrase = re.sub(r"\'s", " is", phrase)             # replace s with 'is'
    phrase = re.sub(r"\'d", " would", phrase)          # replace 'd' with 'would'
    phrase = re.sub(r"\'ll", " will", phrase)          # replace 'll with 'will'
    phrase = re.sub(r"\'t", " not", phrase)            # replace 't' with 'not'
    phrase = re.sub(r"\'ve", " have", phrase)          # replace ve with 'have'
    phrase = re.sub(r"\'m", " am", phrase)             # replace 'm with 'am'
    return phrase

  
def preprocess_text(text_data):
    preprocessed_text = []             
    # tqdm is for printing the status bar
    for sentance in tqdm(text_data):
        sent = decontracted(sentance)           #calling funcion for each sentence
        #print("1st sent" , sent)
        sent = sent.replace('\\r', ' ')         # replace line terminator with space
        sent = sent.replace('\\n', ' ')         # replace new line charactor with space
        sent = sent.replace('\\"', ' ')         
        sent = re.sub('[^A-Za-z]+', ' ', sent)  # remove anything that is not letter
        sent = ''.join(p_stemmer.stem(token) for token in sent )
        sent = ''.join(lemmatizer.lemmatize(token) for token in sent )
        sent  = ' '.join(e for e in sent.split() if len( Counter(e)) > 2 )
        #sent = lstr(emmatize_text(sent)
        
        sent = ' '.join(e for e in sent.split() if e.lower() not in 'root/nltk_data/corpora/stop_words') # checking for stop words
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text

In [34]:
data1=df.copy()

In [35]:
data1['text']=preprocess_text(data1['text'])

100%|████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:02<00:00, 1610.06it/s]


In [37]:
data1.head()

Unnamed: 0,text,aspect,label
0,can you check whether its cancelled completely,cancelled,1
1,cannot rely both milk delivery and grocery items,Milk,0
2,get notification however the really fine,notification,0
3,love this but would love even more gantt chart...,view,1
4,does not let load clip the scene,load,0


In [39]:
data1.shape

(4000, 3)

In [40]:
data1.to_csv('../Dataset/Train.csv')