In [27]:
import pandas as pd
import numpy as np
import random as rand

import re
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from nltk.corpus import stopwords

### Getting Data
For the SVD, we need a corpus comprising a list of questions, since we are using just the questions (not the answers) to make recommendations.

In [72]:
# Load the question/answer table; each row is keyed by its question text.
qa_data = pd.read_csv("Q&A.csv", index_col="Questions")

# Lower-case every question so that later lookups are case-insensitive.
qa_data.index = qa_data.index.str.lower()

# The corpus fed to the SVD is just the questions, in table order.
qa_corpus = qa_data.index.tolist()
qa_corpus

['what is one thing you always wanted as a kid, but never got?',
 'if you could bring someone famous back from the grave, who would you choose?',
 'where do you not mind waiting?',
 'if you could lock up one person in a mental institution, who would it be?',
 'if you could project yourself into the past, where would you go?',
 'what would you refuse to do for one million dollars?',
 'are you a good swimmer?',
 'are you left or right handed?',
 'how old are you?',
 'what football team do you support?',
 'do you correct peoples mistakes?',
 'if you and a friend both wanted the same thing would you let the friend get it first?',
 'if you saw someone drop a £10 note, would you claim it for your own or try to return it to them?',
 'if you met a genie who offered you three wishes, what would you wish for? (more wishes does not count)',
 'how many languages do you speak?',
 'have you ever brought a present for someone that they hated/disliked?',
 'are you a good comedian?',
 'if you were prim

### Data Preprocessing
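Luckily, the questions need very little cleaning because they are already plain text. We still want to strip each one down to its essentials: stop words are dropped, the remaining words are TF-IDF weighted, and a truncated SVD compresses the result into two components per question.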

In [None]:
# Words that carry little meaning on their own ("the", "is", ...); the TF-IDF step ignores them.
try:
    stop_words = stopwords.words('english')
except LookupError:
    # The NLTK stop-word corpus is not installed on a fresh environment:
    # fetch it once, then retry so the notebook survives Restart & Run All.
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

def make_df(corpus, n_components=2, random_state=0):
    """Embed every sentence of ``corpus`` in a low-dimensional LSA space.

    The sentences are TF-IDF weighted (stop words removed, tokens restricted
    to alphabetic words of two or more letters), reduced with a truncated SVD
    and finally rescaled so that each row has unit L2 length.

    Parameters
    ----------
    corpus : list of str
        Sentences to embed; also used as the index of the returned frame.
    n_components : int, default 2
        Number of SVD components (columns) to keep.
    random_state : int or None, default 0
        Seed for the randomized SVD solver, so repeated calls on the same
        corpus give the same embedding. Pass ``None`` for an unseeded run.

    Returns
    -------
    pandas.DataFrame
        One row per sentence, columns ``component_1`` .. ``component_<n>``,
        values rounded to 10 decimal places.
    """
    lsa_pipeline = Pipeline([
        # Weight words by TF-IDF, ignoring stop words and non-alphabetic tokens.
        ('tfidf', TfidfVectorizer(stop_words=stop_words,
                                  token_pattern="\\b[a-zA-Z][a-zA-Z]+\\b",
                                  min_df=1)),
        # Latent semantic analysis: project the TF-IDF matrix onto a few components.
        ('lsa', TruncatedSVD(n_components, random_state=random_state)),
        # Scale each row to unit L2 norm (values are NOT forced to 0/1), so the
        # dot product of two rows is their cosine similarity.
        ('normalizer', Normalizer()),
    ])
    dtm_svd = lsa_pipeline.fit_transform(corpus)

    component_names = [f"component_{i}" for i in range(1, n_components + 1)]
    return pd.DataFrame(dtm_svd.round(10),
                        index=corpus,
                        columns=component_names)

# Preview the LSA embedding of every question (one row per question, two components).
make_df(qa_corpus)

### Recommendation and Chatbot
Now that everything is set up, we can get to the code for the actual question recommender and have a user input questions and receive a response.

In [77]:
def get_similar_sentences(compare_sentence, corpus, num_recom):
    """Return the sentences of ``corpus`` most similar to ``compare_sentence``.

    The query is embedded together with the corpus via ``make_df`` and every
    other row is scored by its dot product with the query row.

    Parameters
    ----------
    compare_sentence : str
        The sentence to find neighbours for.
    corpus : list of str
        Candidate sentences. The list is copied, never mutated.
    num_recom : int
        Maximum number of sentences to return.

    Returns
    -------
    list of str
        Up to ``num_recom`` sentences ordered from most to least similar.
        The query row itself is never returned. Fewer items are returned when
        the corpus holds fewer candidates than requested.
    """
    # Work on a copy so the caller's corpus is left untouched.
    sentences = list(corpus)

    # If the question is already in the corpus we don't need it twice.
    if compare_sentence not in sentences:
        sentences.append(compare_sentence)

    df = make_df(sentences)

    # list.index always yields a single integer position, even if the corpus
    # holds the same text more than once (Index.get_loc would return a
    # slice/mask in that case).
    query_pos = sentences.index(compare_sentence)

    # One matrix-vector product gives the similarity of every row to the query.
    vectors = df.to_numpy()
    similarities = vectors @ vectors[query_pos]

    # Sort (similarity, position) pairs from most to least similar, skipping the query row.
    ranked = sorted(
        ((score, pos) for pos, score in enumerate(similarities) if pos != query_pos),
        reverse=True,
    )

    # Slicing (rather than indexing) tolerates num_recom larger than the candidate count.
    return [sentences[pos] for _, pos in ranked[:max(num_recom, 0)]]

def bot():
    """Run an interactive question-answering loop on stdin/stdout.

    Each question typed by the user is lower-cased and stripped, matched to
    the closest question in ``qa_data`` and answered with one of that
    question's recorded responses, chosen at random. Submitting an empty line
    ends the loop.
    """
    # get user input
    q = input("Hello, my name is AskTwitterBot.\n\nWhat question do you have?").lower().strip()
    while q:
        if q in qa_data.index:
            # Exact hit: answer it directly. The similarity search excludes the
            # query itself, so it would return a *different* question here.
            closest_q = q
        else:
            # find the closest question
            closest_q = get_similar_sentences(q, qa_corpus, 1)[0]
        print(f"\nClosest to '{q}':", closest_q)

        # Selecting with a list always yields a DataFrame, even when the same
        # question appears on several rows; flatten it into one pool of answers.
        matching_rows = qa_data.loc[[closest_q]]
        answers = pd.Series(matching_rows.to_numpy().ravel()).dropna().tolist()

        if answers:
            # randomly select a response from the possible responses
            print(rand.choice(answers))
        else:
            # A question with no recorded answers must not crash the chat.
            print("Sorry, I don't have an answer for that one.")

        q = input("Any other questions? (enter if not)").lower().strip()

# Start the interactive chat; submitting an empty line ends the loop.
bot()

Hello, my name is AskTwitterBot.

What question do you have?what are you?

Closest to 'what are you?': what is one talent you have? / one that you wish that you had?
Umm ... I can touch my nose with my tounge ? And I'm double jointed . 🤷🏻‍♀️
Any other questions? (enter if not)good job you fool

Closest to 'good job you fool': are you a good swimmer?
No 😔
Any other questions? (enter if not)how can i get to australia?

Closest to 'how can i get to australia?': how old are you?
10
Any other questions? (enter if not)how young are uo?

Closest to 'how young are uo?': how many languages do you speak?
english and im getting p good at french. i want to learn dutch, german, and itallian as well
Any other questions? (enter if not)how young are you

Closest to 'how young are you': how old are you?
15
Any other questions? (enter if not)how many years of age do you have

Closest to 'how many years of age do you have': how many languages do you speak?
Only. English
Any other questions? (enter if not