# 1. Load Data

In [1]:
import pandas as pd

In [18]:
# The 2009 dump becomes the searchable corpus; the 2008 dump supplies the
# questions used as queries further down.
train_path = './data/questions_2009.csv'
test_path = './data/questions_2008.csv'
train_questions = pd.read_csv(train_path)
test_questions = pd.read_csv(test_path)

In [3]:
# Report (rows, columns) of the frame loaded from questions_2009.csv.
print(train_questions.shape)

(34279, 8)


In [4]:
# Preview the first rows to see which columns the dump provides.
train_questions.head()

Unnamed: 0.1,Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,5826,404430,23571.0,2009-01-01T02:39:17Z,,8,What have you used Regular Expressions for?,<p>I have heard of regular expressions and onl...
1,5827,404450,,2009-01-01T02:55:13Z,,1,LINQ to SQL,<p>I am finishing off a C# ASP.NET program tha...
2,5828,404470,24457.0,2009-01-01T03:17:22Z,,55,What MIME type if JSON is being returned by a ...,<p>My REST API returns JSON. </p>\n\n<p>I'm cu...
3,5829,404600,4.0,2009-01-01T05:51:41Z,,4,SQL Server Enterprise Manager 2005 - stored pr...,<p>When using SQL Server Management Studio fro...
4,5830,404830,2594.0,2009-01-01T10:31:23Z,,2,Cocoa Won't Capture Shift Modifier?,"<p>Hey All,\nI have an application in which I'..."


In [5]:
# Narrow the frame to the identifier, the score and the two text fields.
q_cols = ['Id', 'Score', 'Title', 'Body']
train_Q = train_questions.loc[:, q_cols]
train_Q.shape

(34279, 4)

# 2. Clean text and create TF-IDF model using "Title"

## 2.1. Clean text

In [6]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [7]:
# Read the titles straight from the column. Walking the frame with iterrows()
# materialises a Series for every row only to pick a single field out of it.
headlines = train_Q['Title'].tolist()
len(headlines)

34279

In [8]:
ps = PorterStemmer()

# Load the stop-word list once and keep it as a set: the list is the same for
# every token, and set membership avoids scanning it linearly per word.
english_stopwords = set(stopwords.words('english'))

# Each title is reduced to lowercase letter-only tokens, stop words are
# dropped and the remaining tokens are stemmed and re-joined with spaces.
corpus = []
for headline in headlines:
    tokens = re.sub('[^a-zA-Z]', ' ', headline).lower().split()
    corpus.append(
        ' '.join(ps.stem(token) for token in tokens if token not in english_stopwords)
    )

In [9]:
# Spot-check the cleaned, stemmed form of the first title.
corpus[0]

'use regular express'

## 2.2. Creating the TF-IDF model

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Bigram-only TF-IDF features, capped at 5000 vocabulary entries.
tfidf_options = {'max_features': 5000, 'ngram_range': (2, 2)}
cv = TfidfVectorizer(**tfidf_options)

# Fit on the cleaned titles, then densify the sparse result into an ndarray.
tfidf_sparse = cv.fit_transform(corpus)
train_QT = tfidf_sparse.toarray()

In [11]:
# One row per cleaned title, one column per TF-IDF feature.
train_QT.shape

(34279, 5000)

In [12]:
# Display the dense TF-IDF matrix (numpy abbreviates the repr).
train_QT

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2.
# Prefer its replacement and fall back only on versions that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Label every TF-IDF column with the bigram it represents.
df1 = pd.DataFrame(train_QT, columns=feature_names)
df1.shape

(34279, 5000)

In [14]:
# Drop every listed column from the raw frame; whatever is not listed stays.
unused_columns = ['Unnamed: 0', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Title', 'Body', 'Score']
df = train_questions.drop(columns=unused_columns)

In [15]:
# Place the remaining question columns next to the TF-IDF feature columns.
frames_side_by_side = [df, df1]
res = pd.concat(frames_side_by_side, axis='columns')
res.shape

(34279, 5001)

In [16]:
# Use the question Id as the row label so that only feature columns remain.
# Guarded so the cell can be re-run: once 'Id' has become the index it is no
# longer a column, and a second set_index('Id') would raise KeyError.
if 'Id' in res.columns:
    res = res.set_index('Id')

In [17]:
# Preview the first four rows of the Id-indexed feature frame.
res.head(4)

Unnamed: 0_level_0,abl use,absolut path,abstract class,accept nest,access asp,access control,access data,access databas,access db,access deni,...,xp vista,xsd schema,xsl templat,youtub video,zend db,zend form,zend framework,zend search,zip archiv,zip file
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
404430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Similarity

In [19]:
from sklearn.neighbors import NearestNeighbors


# Brute-force nearest-neighbour search using cosine distance.
model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
# Fit on the bare array rather than the DataFrame: queries are issued as plain
# ndarrays, and recent scikit-learn versions warn on every query when the
# estimator was fitted with feature names but is queried without them.
model.fit(res.to_numpy())

NearestNeighbors(algorithm='brute', metric='cosine')

In [51]:
def getSimilarQuestions(text, n_neighbors=6):
    """Return the training questions closest to ``text`` in TF-IDF space.

    The text is cleaned in the same way as the training titles (letters only,
    lowercase, English stop words removed, Porter-stemmed), vectorised with
    the already-fitted ``cv`` and looked up in the fitted ``model``.

    Parameters
    ----------
    text : str
        Raw question text to search for.
    n_neighbors : int, default 6
        Number of neighbours to return.

    Returns
    -------
    list of pandas.Series
        One row of ``train_questions`` per neighbour, in the order returned
        by ``model.kneighbors``.
    """
    # Build the stop-word set once per call instead of re-reading the NLTK
    # list for every token.
    stop_words = set(stopwords.words('english'))
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    cleaned = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)

    # transform() on a one-document list already yields a single 2-D row, so
    # no DataFrame round trip or reshape is needed before querying.
    query_vector = cv.transform([cleaned]).toarray()
    indices = model.kneighbors(
        query_vector, n_neighbors=n_neighbors, return_distance=False
    )

    # kneighbors reports row positions within the fitted matrix, so the rows
    # must be fetched positionally (iloc), not by index label (loc).
    return [train_questions.iloc[position] for position in indices.flatten()]

In [91]:
def printSimilarQuestionsList(similars, question):
    """Print the title of ``question`` followed by the title of each match.

    Parameters
    ----------
    similars : iterable
        Objects exposing a ``Title`` attribute, printed one per line.
    question : object
        Object exposing a ``Title`` attribute, shown in the heading line.
    """
    # Join the heading parts with single spaces, exactly as a multi-argument
    # print() would.
    heading_parts = ['Questions similar to: ', str(question.Title), '\n']
    print(' '.join(heading_parts))
    for match in similars:
        print(match.Title)

In [92]:
# Pick the row labelled 1000 from the 2008 frame as an example query.
question = test_questions.loc[1000]
question

Unnamed: 0                                                   1000
Id                                                          83840
OwnerUserId                                               14113.0
CreationDate                                 2008-09-17T14:30:24Z
ClosedDate                                                    NaN
Score                                                           9
Title           Is there a Functional Programming library for ...
Body            <p>For example, in Java there is <a href="http...
Name: 1000, dtype: object

In [93]:
# The query text is the title followed by the raw HTML body.
# NOTE(review): `cv` was fit on cleaned titles only, so body tokens (including
# tag and attribute names left behind once non-letter characters are stripped)
# are matched against a title-only bigram vocabulary — confirm this is intended.
text = question.Title + ' ' + question.Body

In [94]:
# Look up the nearest training questions for the example query and count them.
similars = getSimilarQuestions(text)
len(similars)

6

In [95]:
# Show the query title followed by the titles of the retrieved questions.
printSimilarQuestionsList(similars, question)

Questions similar to:  Is there a Functional Programming library for .NET? 

Matrix Library for .NET
Data import wizard library for .Net?
Media File Conversion Library For .NET
SFTP Libraries for .NET
Writing functional programs in non-functional languages
Functional Programming in C++


In [97]:
# Draw five 2008 questions to use as queries. The seed is fixed so that a
# fresh "Restart & Run All" picks the same rows and the results shown below
# stay reproducible.
questions = test_questions.sample(n=5, random_state=42)
questions.Title

3404                 Scanning Java annotations at runtime
5534    What is the best library for Java to grid/clus...
1756    Algorithm to estimate number of English transl...
5743      Calculating the semantic distance between words
5138    LinkageError: loader constraints violated when...
Name: Title, dtype: object

In [98]:
# For every sampled question: build the query from title and body, retrieve
# its neighbours, print them, then print a separator banner.
banner = '\n\n***************************\n\n'
for index, question in questions.iterrows():
    text = ' '.join((question.Title, question.Body))
    similars = getSimilarQuestions(text)
    printSimilarQuestionsList(similars, question)
    print(banner)

Questions similar to:  Scanning Java annotations at runtime 

web applications -- where to start?
Best way to search for R packages?
What's the best way to search GitHub?
How to determine what classes have been loaded from where in JBoss
log4j - trigger log rolling when application starts
creating something like pageflakes


***************************


Questions similar to:  What is the best library for Java to grid/cluster-enable your application? 

Monitioring running application
Email Template Library in Java
How to sign auto run application in blackberry
Is it possible to configure log4j to create a new file with every run of the application?
running application on client machine
Which SOAP XML object serialization library for Java would you recommend?


***************************


Questions similar to:  Algorithm to estimate number of English translation words from Japanese source 

Count number of points inside a circle fast
Counting the number of common chars in a string and