### Import Modules and Data

In [233]:
import numpy as np
import pandas as pd
import re

import gensim
from nltk.corpus import stopwords

from gensim.corpora import Dictionary
from gensim import corpora, models

from tqdm import tqdm

In [234]:
df_train = pd.read_csv('../data/train_data.csv')
df_test = pd.read_csv('../data/test_data.csv')
df_test.columns = ['id','question1','question2']
all_qs = pd.concat([df_train,df_test], axis = 0).drop('is_duplicate', axis = 1)

#correction for nas
all_qs['question2'][all_qs['question2'].isnull()] = 'Invalid'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### Define Topic Modelling Functions

In [235]:
stops = set(stopwords.words("english"))

In [236]:
def string_to_words(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower().split()
    words = [t for t in text if not t in stops]
    return(words)

In [237]:
def extract_topic_modelling(question_list, topics, threshold):
        
        ids = question_list['id']
        q1_list = question_list['question1']
        q2_list = question_list['question2']
        all_questions = list(q1_list) + list(q2_list)

        #Tokenize and create dictionary (Gensim)
        tok_qs = [string_to_words(q) for q in all_questions]
        dictionary = Dictionary(tok_qs)

        #Filter extreme values
        dictionary.filter_extremes(no_below=1, no_above=0.8)

        #Convert to bagofwords and train on 
        corpus = [dictionary.doc2bow(q) for q in tok_qs]
        LDA = models.LdaModel(corpus, id2word=dictionary, num_topics=topics)
          
        #Q1 Document Topics
        empty_df = np.zeros(shape=(len(q1_list),topics))
        
        cols = []
        for i in range(0, topics):
            cols.append('q1_prob' + str(i))

        q1_df = pd.DataFrame(empty_df, columns=cols)

        for x in tqdm(range(0, len(q1_list))):
            topic_list = LDA.get_document_topics(corpus[x])
            for topic in topic_list:
                top = topic[0]
                prob = topic[1]
                if prob >= threshold:
                    q1_df['q1_prob'+str(top)][x] = 1
                else:
                    q1_df['q1_prob'+str(top)][x] = 0

        #Q2 Document Topics
        empty_df = np.zeros(shape=(len(q2_list),topics))
        
        cols = []
        for i in range(0, topics):
            cols.append('q2_prob' + str(i))

        q2_df = pd.DataFrame(empty_df, columns=cols)

        for x in tqdm(range(len(q1_list), len(corpus))):
            topic_list = LDA.get_document_topics(corpus[x])
            for topic in topic_list:
                top = topic[0]
                prob = topic[1]
                if prob >= threshold:
                    q2_df['q2_prob'+str(top)][x-len(q1_list)] = 1
                else:
                    q2_df['q2_prob'+str(top)][x-len(q1_list)] = 0
        
        sim_colnames = ['similarity_topic'+ str(i) for i in range(0,topics)]
        sim_df = pd.DataFrame(np.zeros(shape=(len(q1_list),topics)), columns = sim_colnames)
        
        for i in range(0,topics):
            sim_df['similarity_topic'+ str(i)] = np.maximum((q1_df['q1_prob' + str(i)] + q2_df['q2_prob' + str(i)]) - 1,0)

                
        dissim_colnames = ['dissimilarity_topic'+ str(i) for i in range(0,topics)]
        dissim_df = pd.DataFrame(np.zeros(shape=(len(q1_list),topics)), columns = dissim_colnames)
        
        for i in range(0,topics):
            dissim_df['dissimilarity_topic'+ str(i)] = (q1_df['q1_prob' + str(i)] + q2_df['q2_prob' + str(i)]) % 2
       
        all_topics = pd.concat([sim_df, dissim_df], axis = 1)
        all_topics = pd.concat([ids.reset_index()['id'],all_topics],axis = 1)
        
        return all_topics

In [238]:
topics = extract_topic_modelling(all_qs,topics = 50, threshold = 0.7)

In [239]:
#topics.to_csv('topic_modelling_output.csv',index = False)

In [242]:
#topics = extract_topic_modelling(all_qs.head(1000),topics = 5, threshold = 0.7)

100%|██████████| 1000/1000 [00:01<00:00, 829.82it/s]
100%|██████████| 1000/1000 [00:01<00:00, 740.36it/s]
