# 1. Load Data

In [1]:
import pandas as pd

In [2]:
# The 2009 dump becomes the searchable corpus; the 2008 dump supplies the
# example queries used further down.
train_path = './data/questions_2009.csv'
test_path = './data/questions_2008.csv'
train_questions = pd.read_csv(train_path)
test_questions = pd.read_csv(test_path)

In [3]:
print(train_questions.shape)

(34279, 8)


In [4]:
train_questions.head()

Unnamed: 0.1,Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,5826,404430,23571.0,2009-01-01T02:39:17Z,,8,What have you used Regular Expressions for?,<p>I have heard of regular expressions and onl...
1,5827,404450,,2009-01-01T02:55:13Z,,1,LINQ to SQL,<p>I am finishing off a C# ASP.NET program tha...
2,5828,404470,24457.0,2009-01-01T03:17:22Z,,55,What MIME type if JSON is being returned by a ...,<p>My REST API returns JSON. </p>\n\n<p>I'm cu...
3,5829,404600,4.0,2009-01-01T05:51:41Z,,4,SQL Server Enterprise Manager 2005 - stored pr...,<p>When using SQL Server Management Studio fro...
4,5830,404830,2594.0,2009-01-01T10:31:23Z,,2,Cocoa Won't Capture Shift Modifier?,"<p>Hey All,\nI have an application in which I'..."


In [5]:
# Narrow the training frame to the identifier, score and text columns.
q_cols = ['Id', 'Score', 'Title', 'Body']
train_Q = train_questions.loc[:, q_cols]
train_Q.shape

(34279, 4)

# 2. Clean text and create TF-IDF model using "Title"

## 2.1. Clean text

In [6]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [7]:
# Read the titles straight from the column: iterrows() materialises a full
# Series for every row only to pick out a single field.
headlines = train_Q['Title'].tolist()
len(headlines)

34279

In [8]:
ps = PorterStemmer()


def _clean_headline(raw_title):
    """Keep letters only, lower-case, and Porter-stem every word of a title."""
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_title)
    tokens = letters_only.lower().split()
    # Stop words are kept: no stop-word filter is applied here.
    return ' '.join(ps.stem(token) for token in tokens)


# One cleaned string per training title, in the same order as `headlines`.
corpus = [_clean_headline(headline) for headline in headlines]

In [9]:
corpus[0]

'what have you use regular express for'

## 2.2. Creating the TF-IDF model

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Bigram-only TF-IDF over the cleaned titles, capped at 5000 features.
cv = TfidfVectorizer(max_features=5000, ngram_range=(2, 2))
title_tfidf = cv.fit_transform(corpus)
# Densified because the following cells wrap the matrix in a DataFrame.
train_QT = title_tfidf.toarray()

In [11]:
train_QT.shape

(34279, 5000)

In [12]:
train_QT

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
df1 = pd.DataFrame(train_QT, columns=feature_names)
df1.shape

(34279, 5000)

In [14]:
# Only the question Id is needed alongside the TF-IDF features. Selecting it
# explicitly avoids a KeyError when one of the previously enumerated columns
# is absent, and keeps any additional CSV column out of the feature matrix.
df = train_questions[['Id']]

In [15]:
# Put the Id column next to the TF-IDF columns; rows are matched on the
# index of the two frames.
frames = [df, df1]
res = pd.concat(frames, axis='columns')
res.shape

(34279, 5001)

In [16]:
# Guarded so the cell can be re-run: once 'Id' has become the index, a
# second set_index('Id') would raise KeyError.
if 'Id' in res.columns:
    res = res.set_index('Id')

In [17]:
res.head(4)

Unnamed: 0_level_0,abl to,about the,absolut path,abstract class,access an,access control,access data,access databas,access db,access deni,...,you put,you recommend,you set,you use,your favorit,your own,zend db,zend form,zend framework,zip file
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
404430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.394512,0.0,0.0,0.0,0.0,0.0,0.0
404450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Similarity

In [18]:
from sklearn.neighbors import NearestNeighbors


# Exhaustive (brute-force) nearest-neighbour search using cosine distance.
model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
# `res` holds the TF-IDF features of the training titles, indexed by Id.
model.fit(res)

NearestNeighbors(algorithm='brute', metric='cosine')

In [19]:
def getSimilarQuestions(text, n_neighbors=6):
    """Return the training questions whose titles are closest to `text`.

    The query is normalised the same way as the training titles (letters
    only, lower-cased, Porter-stemmed), vectorised with the fitted TF-IDF
    model `cv`, and matched against the fitted `model`.

    Parameters
    ----------
    text : str
        Free text to search with.
    n_neighbors : int, default 6
        Number of neighbours to return.

    Returns
    -------
    list of pandas.Series
        Rows of `train_questions`, in the order returned by
        `model.kneighbors`.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Stop words are kept: no stop-word filter is applied here.
    cleaned = ' '.join(ps.stem(token) for token in tokens)

    # transform() takes an iterable of documents, so the single query is
    # wrapped in a list; the dense result has shape (1, n_features).
    query_vector = cv.transform([cleaned]).toarray()
    _, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)

    # kneighbors() reports row positions within the fitted matrix, not index
    # labels, so the lookup must be positional (.iloc) rather than .loc.
    return [train_questions.iloc[position] for position in indices.flatten()]

In [20]:
def printSimilarQuestionsList(similars, question):
    """Print a heading naming the query, then one similar title per line.

    Parameters
    ----------
    similars : iterable
        Objects exposing a `Title` attribute (e.g. rows returned by
        getSimilarQuestions).
    question : str
        Text shown in the heading.
    """
    # Printed as three separate arguments so print() inserts its default
    # single-space separators between them.
    heading_parts = ['Questions similar to: ', question, '\n']
    print(*heading_parts)
    for neighbour in similars:
        print(neighbour.Title)

In [21]:
# Use the 2008 question whose index label is 1000 as a sample query.
question = test_questions.loc[1000]
question

Unnamed: 0                                                   1000
Id                                                          83840
OwnerUserId                                               14113.0
CreationDate                                 2008-09-17T14:30:24Z
ClosedDate                                                    NaN
Score                                                           9
Title           Is there a Functional Programming library for ...
Body            <p>For example, in Java there is <a href="http...
Name: 1000, dtype: object

In [22]:
# Query text is the title followed by the body.
# NOTE(review): the vectorizer was fitted on titles only and the body still
# carries HTML markup — confirm that including the raw body is intended.
text = ' '.join((question.Title, question.Body))

In [23]:
# Look up the nearest training questions for the sample query and check how
# many were returned.
similars = getSimilarQuestions(text)
len(similars)

6

In [24]:
# Show the matches under the sample query's title.
printSimilarQuestionsList(similars, question.Title)

Questions similar to:  Is there a Functional Programming library for .NET? 

SFTP Libraries for .NET
Data import wizard library for .Net?
Matrix Library for .NET
Media File Conversion Library For .NET
Is there a streaming API for JSON?
how to select divs with jquery like for example in photoshop ? (select area)


In [25]:
# Fixed seed so that Restart & Run All draws the same five queries each time.
questions = test_questions.sample(n=5, random_state=42)
questions.Title

2738     Advantages of Antlr (versus say, lex/yacc/bison)
1212    Is there a way to prevent google search terms ...
2251        Python style: multiple-line conditions in IFs
2116    Problem using the ASP.NET FileUpload control i...
3502                         Sensitive data in Viewstate?
Name: Title, dtype: object

In [26]:
# Run the same title+body lookup for every sampled question.
# Note: this loop rebinds the notebook-level names `question`, `text` and
# `similars` that the single-query cells above also assign.
separator = '\n\n***************************\n\n'
for index, question in questions.iterrows():
    text = ' '.join((question.Title, question.Body))
    similars = getSimilarQuestions(text)
    printSimilarQuestionsList(similars, question.Title)
    print(separator)

Questions similar to:  Advantages of Antlr (versus say, lex/yacc/bison) 

Jqmodal isn't working in the Updatepanel
Android: Simulate WiFi in the emulator?
BC30560: 'default_aspx' is ambiguous in the namespace 'ASP'
Can i give alignment in the picker view's data?
What permissions should Developers have in the Dev database instance
Can I obtain higher resolution in the frequency domain with a stereo signal?


***************************


Questions similar to:  Is there a way to prevent google search terms from matching urls? 

Is there a way to paint semi transparently on a PictureBox?
Is there a way to emulate a QGroupVBoxLayout?
Adding html in DataTextFormatString
Is there a way to get Asio working without Boost?
Is there a way to get which classes a ClassLoader has loaded?
Is there a way to make imshow in octave behave more like Matlab?


***************************


Questions similar to:  Python style: multiple-line conditions in IFs 

How to do something as application load Logo i