import pandas as pd
import numpy as np
import collections
from sklearn.feature_extraction import text

## Loading the DataSet

In [7]:
data = pd.read_csv("Evaluation-dataset.csv", header=None,names=['text', '1','2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14'])
data.head()

Unnamed: 0,text,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Tires where delivered to the garage of my choi...,garage service positive,ease of booking positive,,,,,,,,,,,,
1,"Easy Tyre Selection Process, Competitive Prici...",garage service positive,value for money positive,,,,,,,,,,,,
2,Very easy to use and good value for money.,value for money positive,,,,,,,,,,,,,
3,Really easy and convenient to arrange,ease of booking positive,,,,,,,,,,,,,
4,It was so easy to select tyre sizes and arrang...,location positive,value for money positive,ease of booking positive,,,,,,,,,,,


In [8]:
data.describe()


Unnamed: 0,text,1,2,3,4,5,6,7,8,9,10,11,12,13,14
count,10132,8129,4239,1538,445,128,36,13,8,4,1,1,1,1,1
unique,10128,84,65,52,50,37,20,9,6,2,1,1,1,1,1
top,Very competitive pricing,value for money positive,value for money positive,value for money positive,value for money positive,garage service positive,value for money positive,booking confusion negative,value for money positive,garage service positive,incorrect tyres sent positive,call wait time positive,refund positive,no stock positive,balancing positive
freq,2,3334,1113,253,58,12,7,2,3,2,1,1,1,1,1


In [9]:
# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())


Missing Values:
text        0
1        2003
2        5893
3        8594
4        9687
5       10004
6       10096
7       10119
8       10124
9       10128
10      10131
11      10131
12      10131
13      10131
14      10131
dtype: int64


In [11]:
sent_count = {}

for i in data.columns:
    if i!='text':
        z = data[i].dropna()
        for l in z:
            if l not in sent_count.keys():
                sent_count[l] = 1
            else:
                sent_count[l]+=1

In [None]:
sent_count


### Creating a list to store the value counts of each subtheme sentence

In [13]:
# Sort the dictionary by values in descending order
sorted_items = sorted(sent_count.items(), key=lambda x: x[1], reverse=True)

# Create a list of keys where the corresponding values are greater than 30
filtered_keys = [key for key, value in sorted_items if value > 30]

filtered_keys

['value for money positive',
 'garage service positive',
 'ease of booking positive',
 'location positive',
 'length of fitting positive',
 'delivery punctuality positive',
 'tyre quality positive',
 'garage service negative',
 'change of date negative',
 'wait time positive',
 'delivery punctuality negative',
 'advisoragent service positive',
 'ease of booking negative',
 'mobile fitter positive',
 'advisor/agent service positive',
 'value for money negative',
 'wait time negative',
 'damage negative',
 'advisoragent service negative',
 'booking confusion negative',
 'discounts positive',
 'length of fitting negative',
 'extra charges positive',
 'response time negative',
 'late notice negative',
 'incorrect tyres sent negative',
 'advisor/agent service negative',
 'extra charges negative',
 'change of time negative',
 'no stock negative',
 'tyre quality negative',
 'response time positive',
 'facilities positive']

In [14]:
len(filtered_keys)

33

#### Make a function that builds a table with subthemes as columns. The values in each column are determined by the original order of sentences in the subtheme columns.

In [19]:
import pandas as pd

def modify_data(input_data):
    # Extract subtheme columns from the original dataframe
    subtheme_data = input_data.iloc[:, 1:]
    
    # Create a new dataframe with subtheme columns
    modified_data = pd.DataFrame(index=range(0, subtheme_data.shape[0]), columns=filtered_keys)
    
    # Iterate through each row in the original dataframe
    for index, row in subtheme_data.iterrows():
        # Extract non-null subthemes from the row
        subthemes_list = row.dropna().tolist()
        
        # Map the presence of subthemes to the new dataframe
        for subtheme in subthemes_list:
            modified_data.loc[index][subtheme] = 1
    
    # Fill NaN values with 0
    modified_data = modified_data.fillna(0)
    
    # Create the final dataframe with 'Sentence' column and mapped subthemes
    final_data = pd.DataFrame()
    final_data['text'] = input_data['text']
    
    # Copy subtheme columns to the final dataframe
    for column in modified_data.columns:
        final_data[column] = modified_data[column]
    
    return final_data


In [20]:
final_data = modify_data(data)

In [21]:
final_data.head()


Unnamed: 0,text,value for money positive,garage service positive,ease of booking positive,location positive,length of fitting positive,delivery punctuality positive,tyre quality positive,garage service negative,change of date negative,...,response time negative,late notice negative,incorrect tyres sent negative,advisor/agent service negative,extra charges negative,change of time negative,no stock negative,tyre quality negative,response time positive,facilities positive
0,Tires where delivered to the garage of my choi...,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Easy Tyre Selection Process, Competitive Prici...",1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Very easy to use and good value for money.,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Really easy and convenient to arrange,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,It was so easy to select tyre sizes and arrang...,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
import string
import re
from collections import Counter
X = final_data['text'].tolist()
X[:2]

['Tires where delivered to the garage of my choice,the garage notified me when they had been delivered. A day and time was arranged with the garage and I went and had them fitted,a Hassel free experience.',
 'Easy Tyre Selection Process, Competitive Pricing and Excellent Fitting Service']

## Pre Processing
   #### - Eliminate punctuation and numerical characters.
   #### - Convert all words to lowercase.
   #### - Exclude common stop words (e.g., the, a, that, this, it, etc.).
   #### - Get rid of emojis.
   #### - Remove words occurring rarely (less than or equal to 10 times).

In [23]:

from collections import Counter
from nltk.corpus import stopwords

# Function to remove specified elements from a string
def full_remove(x, removal_list):
    for w in removal_list:
        x = x.replace(w, ' ')
    return x

# Remove digits
remove_digits = [full_remove(x, list(map(str, range(10)))) for x in X]

# Remove punctuation
remove_punc = [full_remove(x, list(string.punctuation)) for x in remove_digits]

# Convert to lowercase and remove whitespace
sents_lower = [x.lower().strip() for x in remove_punc]

# Function to remove emojis from a string
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

# Remove stop words
stops = stopwords.words("english")
sents_processed = [' '.join([word for word in sentence.split() if word not in stops]) for sentence in sents_lower]

# Counter for word frequencies
cnt = Counter()

# Remove rare words
n_rare_words = 10
RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
sents_processed_3 = [' '.join([word for word in sentence.split() if word not in RAREWORDS]) for sentence in sents_processed]

sents_processed_3[:2]


['tires delivered garage choice garage notified delivered day time arranged garage went fitted hassel free experience',
 'easy tyre selection process competitive pricing excellent fitting service']

In [24]:
sent_processed = pd.DataFrame(sents_processed_3, columns=['Sent_Processed'])
d2 = final_data.iloc[:,1:]
final_data = pd.concat([sent_processed, d2], axis=1)
final_data.head()

Unnamed: 0,Sent_Processed,value for money positive,garage service positive,ease of booking positive,location positive,length of fitting positive,delivery punctuality positive,tyre quality positive,garage service negative,change of date negative,...,response time negative,late notice negative,incorrect tyres sent negative,advisor/agent service negative,extra charges negative,change of time negative,no stock negative,tyre quality negative,response time positive,facilities positive
0,tires delivered garage choice garage notified ...,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,easy tyre selection process competitive pricin...,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,easy use good value money,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,really easy convenient arrange,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,easy select tyre sizes arrange local fitting p...,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Vectorizing the words

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
vect = CountVectorizer(max_df=1.0,stop_words='english')

In [26]:
X_train,X_test,y_train,y_test = train_test_split(final_data['Sent_Processed'],final_data[final_data.columns[1:]],test_size=0.2)
X_train_vec = vect.fit_transform(X_train)
X_test_vec = vect.transform(X_test)

In [28]:
print(X_train_vec.shape)
print(y_train.shape)
print(X_test_vec.shape)
print(y_test.shape)

(8105, 5552)
(8105, 33)
(2027, 5552)
(2027, 33)


## Model Preparation

In [29]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_vec, y_train)

In [30]:
y_pred_class = nb_classif.predict(X_test_vec)

## Evaluting the Models

In [31]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score,recall_score,precision_score,f1_score

In [32]:
ac = accuracy_score(y_test,y_pred_class)
f1 = f1_score(y_test,y_pred_class,average='micro')
print('Accuracy Score : ', ac)
print('F1-Score : ', f1)

Accuracy Score :  0.351258016773557
F1-Score :  0.5316402116402116


In [33]:
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred_class))

              precision    recall  f1-score   support

           0       0.67      0.93      0.78       909
           1       0.65      0.28      0.39       392
           2       0.73      0.35      0.48       235
           3       0.81      0.36      0.50       206
           4       0.53      0.07      0.12       117
           5       0.25      0.01      0.02        80
           6       0.60      0.03      0.06        91
           7       0.56      0.42      0.48        86
           8       0.46      0.60      0.52        57
           9       0.17      0.02      0.04        49
          10       0.51      0.53      0.52        53
          11       0.00      0.00      0.00        46
          12       0.48      0.22      0.30        54
          13       0.00      0.00      0.00        55
          14       0.25      0.02      0.04        51
          15       0.00      0.00      0.00        25
          16       0.22      0.10      0.13        21
          17       0.65    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
