In [120]:
import pandas as pd
import numpy as np
import random as rand

import re
import string
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.pipeline import Pipeline

### Getting Data
For the SVD, we need a corpus comprising a list of questions, since we are using just the questions (not the answers) to make recommendations.

In [121]:
qa_data = pd.read_csv("Q&A.csv", index_col="Questions")
qa_data.index = qa_data.index.str.lower() # all questions lowercase for consistency
qa_corpus = list(qa_data.index)
qa_corpus

['what is one thing you always wanted as a kid, but never got?',
 'if you could bring someone famous back from the grave, who would you choose?',
 'where do you not mind waiting?',
 'if you could lock up one person in a mental institution, who would it be?',
 'if you could project yourself into the past, where would you go?',
 'what would you refuse to do for one million dollars?',
 'are you a good swimmer?',
 'are you left or right handed?',
 'how old are you?',
 'what football team do you support?',
 'do you correct peoples mistakes?',
 'if you and a friend both wanted the same thing would you let the friend get it first?',
 'if you saw someone drop a £10 note, would you claim it for your own or try to return it to them?',
 'if you met a genie who offered you three wishes, what would you wish for? (more wishes does not count)',
 'how many languages do you speak?',
 'have you ever brought a present for someone that they hated/disliked?',
 'are you a good comedian?',
 'if you were prim

### Data Preprocessing
Luckily, we don't have to change too much since the questions are already in the right format. However, we still want to simplify the information into the bare necessities.

In [143]:
# list of words that don't provide much information
stop_words = stopwords.words('english')
keep_list = ["who", "what", "when", "where", "why", "how", "which", "name"]

for w in keep_list:
    if w in stop_words:
        print('removed from stopwords:', w)
        stop_words.remove(w)

removed from stopwords: who
removed from stopwords: what
removed from stopwords: when
removed from stopwords: where
removed from stopwords: why
removed from stopwords: how
removed from stopwords: which


In [140]:
tf = TfidfVectorizer(lowercase=True, 
                     token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b", 
                     stop_words=stop_words,
                     min_df=1)
X = tf.fit_transform(qa_corpus)
lsa = TruncatedSVD(n_components=qa_data.shape[1]-1)
lsa.fit(X)
var_ratio = lsa.explained_variance_ratio_

def select_n_components(var_ratio, goal_var: float) -> int:
    # set initial variance explained so far
    total_variance = 0.0
    
    # set initial number of features
    n_components = 0
    
    # for the explained variance of each feature:
    for explained_variance in var_ratio:
        
        # add the explained variance to the total
        total_variance += explained_variance
        
        # add one to the number of components
        n_components += 1
        
        # if we reach our goal level of explained variance
        if total_variance >= goal_var:
            # end the loop
            break
            
    # return the number of components
    return n_components

n_components = select_n_components(var_ratio, 0.95)
print(n_components)

17


In [141]:
def make_df(corpus):
    # pipeline that uses a TF-IDF Vectorizer to remove punctuation, useless words, and assign value from word rarity
    pipe = [('tfidf', TfidfVectorizer(stop_words=stop_words, 
                        token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b", 
                        min_df=1)),
                        ('lsa', TruncatedSVD(n_components)), # squishes the SVD down to two dimensions
                        ('normalizer', Normalizer())] # scales 0-1 to make the similarity more clear
    
    pipeline = Pipeline(pipe)
    # SVD matrix will be used by our recommendation function (like with movie recommendation)
    dtm_svd = pipeline.fit_transform(corpus)

    return pd.DataFrame(dtm_svd.round(10),
                 index=corpus)

make_df(qa_corpus)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
"what is one thing you always wanted as a kid, but never got?",0.431135,-0.00356,0.185979,0.187192,-0.21822,-0.071114,0.334646,0.125008,-0.376574,-0.337827,0.317892,-0.028188,-0.172794,0.310608,0.250978,-0.077006,0.105585
"if you could bring someone famous back from the grave, who would you choose?",0.158931,0.038301,0.618049,0.056631,-0.270894,-0.028443,-0.005363,0.045081,0.325121,0.036842,-0.000834,0.117037,0.251606,-0.209975,-0.464065,-0.211521,-0.153551
where do you not mind waiting?,0.009156,0.008834,0.116092,0.020213,-0.07144,-0.005287,-0.164644,-0.064373,0.232702,0.243509,-0.005609,0.562962,0.065581,0.062014,0.687362,-0.093772,0.172508
"if you could lock up one person in a mental institution, who would it be?",0.218391,0.044513,0.770854,0.036362,-0.181959,0.360445,-0.210539,-0.05777,0.06865,-0.169292,0.134698,-0.018033,0.010894,0.056242,-0.189144,0.160183,-0.165941
"if you could project yourself into the past, where would you go?",0.091763,0.074916,0.51278,0.037548,-0.186199,-0.003177,-0.258274,-0.0537,0.33872,0.332723,-0.003592,0.465652,0.01583,0.104647,0.399244,-0.055372,0.037429
what would you refuse to do for one million dollars?,0.604026,0.0033,0.491941,0.040489,-0.218915,-0.058679,-0.179759,-0.122095,-0.267951,-0.213429,-0.083407,-0.321272,-0.091693,0.026694,0.134739,0.140572,0.135468
are you a good swimmer?,0.015122,0.001386,0.179789,-0.123914,0.63247,-0.53425,-0.13296,-0.080694,0.091745,-0.408203,0.170995,0.144039,-0.007948,0.012279,0.047494,0.104927,-0.062822
are you left or right handed?,0.011866,-0.001039,-0.019159,0.898261,0.223953,0.024849,-0.089556,-0.01265,0.058578,0.025241,0.046895,0.039579,-0.121425,0.170976,-0.161461,-0.193789,-0.135024
how old are you?,0.011772,0.963576,-0.055928,-0.003984,0.020988,-0.007682,0.01677,0.000624,-0.010994,-0.032078,-0.007746,0.019432,-0.208726,-0.139995,-0.049633,0.003109,0.015229
what football team do you support?,0.656386,-0.016509,-0.217131,-0.039995,0.018344,-0.039462,0.012165,-0.049493,-0.007506,-0.014175,-0.082381,0.058465,-0.098066,-0.259772,0.258307,-0.530033,-0.283602


### Recommendation and Chatbot
Now that everything is set up, we can get to the code for the actual question recommender and have a user input questions and receive a response.

In [145]:
def get_similar_sentences(compare_sentence, corpus, num_recom):
    # using a temp corpus so as not to alter the original
    temp_corpus = [] + corpus
    
    # if the question being asked is already in our database, we don't need it twice
    if compare_sentence not in temp_corpus:
        temp_corpus.append(compare_sentence)
    
    df = make_df(temp_corpus) # now we can make the SVD dataframe
    temp_corpus.clear()
    comp = df.index.get_loc(compare_sentence) # index of our question
    
    recs = []
    for sentence in range(df.shape[0]):
        if sentence != comp:
            # dot product of our question and another sentence to measure their semantic similarity
            recs.append((np.dot(df.iloc[comp], df.iloc[sentence]), sentence))
            
    # sorting questions by most -> least similar so we can respond with the "closest" answer
    recs.sort(reverse = True)
    final_rec = [df.iloc[recs[i][1]].name for i in range(num_recom)]
    return final_rec

def bot():
    # get user input
    q = input("Hello, my name is AskTwitterBot.\n\nWhat question do you have?").lower().strip()
    while q:
        # find the closest question
        closest_q = get_similar_sentences(q, qa_corpus, 1)[0]
        print(f"\nClosest to '{q}':", closest_q)
        
        # randomly select a response from the possible responses
        ans = qa_data.loc[closest_q].dropna()
        x = rand.randint(0, len(ans) - 1)
        print(ans.values[x])
        
        q = input("Any other questions? (enter if not)").lower().strip()

bot()

Hello, my name is AskTwitterBot.

What question do you have?Where do you live?

Closest to 'where do you live?': where do you not mind waiting?
Santa Barbara, California
Any other questions? (enter if not)What's the capital of Australia?

Closest to 'what's the capital of australia?': what are some of your bad habits?
Not thinking things through .
Any other questions? (enter if not)How do you do?

Closest to 'how do you do?': how are you?
tired 😂
Any other questions? (enter if not)What's your name?

Closest to 'what's your name?': what do you do?
I’m a student.
Any other questions? (enter if not)Who is it?

Closest to 'who is it?': if you met a genie who offered you three wishes, what would you wish for? (more wishes does not count)
To call Allie cat again


KeyboardInterrupt: Interrupted by user