# Data cleaning

In this step we will load the dataset and perform some basic cleaning in order to simplify our further steps.



In [1]:
import pandas as pd
import os
import numpy as np
import warnings
warnings.filterwarnings("ignore")


# path_train : location of the training file
# Code starts here

# Read the training set, using the MID column as the row index
df = pd.read_csv(filepath_or_buffer="train_data.csv", index_col="MID")

# Show the first few rows as a quick sanity check
print(df.head())

#Code ends here


                                            message   category
MID                                                           
0                                      7am everyday  reminders
1                                    chocolate cake       food
2    closed mortice and tenon joint door dimentions    support
3                               train eppo kelambum     travel
4      yesterday i have cancelled the flight ticket     travel


# Data Processing

As we have seen in the Text Analytics concepts, we need to convert this textual data into vectors so that we can apply machine learning algorithms to it. In this task we will use a standard TF-IDF vectorizer to vectorize the message column and label-encode the category column, which turns this into a classification problem. 


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Sampling at most 1000 samples of each category.
# Each category is sampled on its own (same seed for every group) and the
# pieces are concatenated in sorted category order. This keeps `category` as
# an ordinary column and MID as the index; groupby(...).apply(...) would also
# push `category` into the index, which makes a second run of this cell fail
# with an "index level and column label" ambiguity and relies on
# pandas-version-specific handling of grouping columns inside apply.
SAMPLES_PER_CATEGORY = 1000
df = pd.concat(
    [
        # Cap at the group size so a small category does not raise
        group.sample(n=min(len(group), SAMPLES_PER_CATEGORY), random_state=0)
        for _, group in df.groupby('category')
    ]
)

# Code starts here

# Converting all messages to lower case and storing it
all_text = df["message"].str.lower()

# Initialising TF-IDF object
tfidf = TfidfVectorizer(stop_words="english")

# Fitting the vectorizer and storing the TF-IDF vectorized data as a dense array
X = tfidf.fit_transform(all_text).toarray()

# Initiating a label encoder object
le = LabelEncoder()

# Fitting the label encoder on the categories and storing the encoded labels
y = le.fit_transform(df["category"])

# Classification implementation

In the previous tasks we cleaned the data and converted the textual data into numbers so that we can apply machine learning models. In this task we will apply Logistic Regression, Naive Bayes and Linear SVM models to the data.



In [3]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Code starts here

# Hold out 30% of the data for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42
)


def _fit_and_score(model):
    """Fit `model` on the training split; return (validation predictions, macro F1)."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    return predictions, f1_score(y_val, predictions, average='macro')


# Logistic Regression
log_reg = LogisticRegression(random_state=0)
y_pred, log_f1 = _fit_and_score(log_reg)
print(str(log_f1) + " is the f1 score of the logistic regression model")

# Multinomial Naive Bayes
nb = MultinomialNB()
y_pred, nb_f1 = _fit_and_score(nb)
print(str(nb_f1) + " is the f1 score of the Naive Bayes model")

# Linear SVM
lsvm = LinearSVC(random_state=0)
y_pred, lsvm_f1 = _fit_and_score(lsvm)
print(str(lsvm_f1) + " is the f1 of the LinearSVC model")

0.7121925559391143 is the f1 score of the logistic regression model
0.7098202248522268 is the f1 score of the Naive Bayes model
0.7167828373922656 is the f1 of the LinearSVC model


The best score is given by the LinearSVC model.

# Validation of test data

Let's now see how well our best model (LinearSVC) performs on the test set.


In [4]:

# path_test : Location of test data

#Loading the dataframe
df_test = pd.read_csv("test_data.csv")

# Code starts here

# Lower-casing the test messages the same way as the training messages
all_text = df_test["message"].str.lower()

# Transforming using the tfidf object - tfidf
X_test = tfidf.transform(all_text).toarray()

# Predicting using the linear svm model - lsvm
# The model was trained on label-encoded targets, so its predictions are
# integer codes; map them back to the original category names with the fitted
# label encoder so the `category` column holds real labels.
predicted_codes = lsvm.predict(X_test)
submission = pd.DataFrame(le.inverse_transform(predicted_codes))

# Create the submission file
submission.to_csv('submission_domain.csv', index=False, header=['category'])

# BONUS ACTIVITY

# LSI Modeling

Let's now attempt topic modelling on our dataset.
We will use LSI (Latent Semantic Indexing, also known as LSA, Latent Semantic Analysis) for this.


### LSA Modeling

In this task, we will try to see how to use LSI on the entire dataset.


In [5]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim.models.lsimodel import LsiModel
from gensim import corpora
from pprint import pprint
# import nltk
# nltk.download('wordnet')

# Lemmatizer applied to every token that survives filtering
lemma = WordNetLemmatizer()

# Punctuation characters to strip out of the text
exclude = set(string.punctuation)

# English stopwords to drop
stop = set(stopwords.words('english'))

# Function to lemmatize and remove the stopwords
def clean(doc):
    """Return `doc` lower-cased, without stopwords and punctuation, lemmatized.

    Steps, in order:
    1. lower-case and split on whitespace, dropping tokens found in `stop`;
    2. remove every character found in `exclude` from the remaining text;
    3. lemmatize each remaining token with `lemma` and re-join with spaces.
    """
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    without_punctuation = "".join(
        char for char in " ".join(kept_tokens) if char not in exclude
    )
    lemmas = [lemma.lemmatize(token) for token in without_punctuation.split()]
    return " ".join(lemmas)

# Collecting the raw text of every message in the dataframe
list_of_docs = df["message"].to_list()

# Cleaning each message and splitting it into a list of tokens
doc_clean = [cleaned.split() for cleaned in map(clean, list_of_docs)]

# Code starts here

# Building the id2word dictionary from the cleaned token lists
dictionary = corpora.Dictionary(doc_clean)

# Converting every cleaned document into its bag-of-words representation
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))


# Fitting a 5-topic LSI model and printing its topics
lsimodel = LsiModel(doc_term_matrix, num_topics=5, id2word=dictionary)
pprint(lsimodel.print_topics())

# Code ends here

[(0,
  '0.347*"reminder" + 0.267*"like" + 0.267*"cancel" + 0.266*"would" + '
  '0.256*"userid" + 0.256*"apiname" + 0.256*"exotel" + 0.256*"offset" + '
  '0.255*"taskname" + 0.255*"reminderlist"'),
 (1,
  '-0.831*"want" + -0.221*"u" + -0.187*"know" + -0.181*"movie" + -0.135*"book" '
  '+ -0.128*"ticket" + -0.114*"need" + -0.108*"hi" + -0.096*"please" + '
  '-0.092*"service"'),
 (2,
  '0.451*"reminder" + -0.328*"call" + -0.316*"u" + -0.233*"wake" + '
  '0.205*"water" + -0.197*"march" + -0.192*"wakeup" + 0.185*"every" + '
  '0.181*"drink" + 0.168*"want"'),
 (3,
  '0.611*"u" + -0.418*"want" + 0.244*"need" + 0.238*"reminder" + '
  '0.197*"please" + 0.143*"movie" + 0.117*"service" + -0.102*"wake" + '
  '0.101*"near" + 0.101*"help"'),
 (4,
  '0.622*"need" + -0.510*"u" + 0.491*"movie" + 0.189*"offer" + -0.137*"want" + '
  '0.115*"ticket" + 0.058*"know" + -0.052*"find" + 0.051*"today" + '
  '0.049*"book"')]


# LDA Modeling

Next, let's try topic modeling using LDA. We will first find the optimum number of topics using the coherence score, and then build a model with that number of topics.



In [6]:
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task

# Function to calculate coherence values
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus the LDA models are trained on
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range tried)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    topic_list : No. of topics chosen
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    topic_list = []
    for num_topics in range(start, limit, step):
        # Train on the `corpus` argument rather than on a notebook-level
        # variable, so the function works for whatever corpus the caller passes.
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            random_state=0,
            num_topics=num_topics,
            id2word=dictionary,
            iterations=10,
        )
        topic_list.append(num_topics)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values


# Code starts here

# Scoring LDA models for 1, 6, 11, ... topics (below 41)
topic_list, coherence_value_list = compute_coherence_values(
    dictionary=dictionary,
    corpus=doc_term_matrix,
    texts=doc_clean,
    limit=41,
    start=1,
    step=5,
)
print(coherence_value_list)

# Position of the (first) highest coherence value
max_index = max(
    range(len(coherence_value_list)), key=coherence_value_list.__getitem__
)

# Number of topics that produced that coherence value
opt_topic = topic_list[max_index]
print("Optimum no. of topics:", opt_topic)

# Fitting the final LDA model with the optimum number of topics
lda_model = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=opt_topic,
    passes=30,
    iterations=10,
    random_state=0,
)

# pprint(lda_model.print_topics(5))
lda_model.print_topic(1)


[0.3287476298674388, 0.4801812391625579, 0.5306698259321219, 0.5376618801954907, 0.5587078765648961, 0.572049572781549, 0.5663902769474314, 0.5892433863373899]
Optimum no. of topics: 36


'0.167*"2" + 0.101*"mobile" + 0.053*"discount" + 0.035*"already" + 0.027*"information" + 0.018*"table" + 0.016*"want" + 0.014*"cheap" + 0.014*"black" + 0.013*"kid"'