# 1. Load Data

In [1]:
import pandas as pd

In [20]:
# Read the 2009 ("train") and 2008 ("test") CSV files, then the tag table.
# Files are read in the same order as before: questions, answers, tags.
question_paths = ('./data/questions_2009.csv', './data/questions_2008.csv')
answer_paths = ('./data/answers_2009.csv', './data/answers_2008.csv')

train_questions, test_questions = map(pd.read_csv, question_paths)
train_answers, test_answers = map(pd.read_csv, answer_paths)
tags_data = pd.read_csv('./data/Tags.csv')

In [21]:
# Print the (rows, columns) shape of every loaded frame, one per line,
# in load order: questions, answers, tags.
loaded_frames = (
    train_questions,
    test_questions,
    train_answers,
    test_answers,
    tags_data,
)
for frame in loaded_frames:
    print(frame.shape)

(34279, 8)
(5789, 8)
(112854, 7)
(30136, 7)
(3750994, 2)


In [4]:
# Preview the first rows of the training questions frame.
train_questions.head()

Unnamed: 0.1,Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,5826,404430,23571.0,2009-01-01T02:39:17Z,,8,What have you used Regular Expressions for?,<p>I have heard of regular expressions and onl...
1,5827,404450,,2009-01-01T02:55:13Z,,1,LINQ to SQL,<p>I am finishing off a C# ASP.NET program tha...
2,5828,404470,24457.0,2009-01-01T03:17:22Z,,55,What MIME type if JSON is being returned by a ...,<p>My REST API returns JSON. </p>\n\n<p>I'm cu...
3,5829,404600,4.0,2009-01-01T05:51:41Z,,4,SQL Server Enterprise Manager 2005 - stored pr...,<p>When using SQL Server Management Studio fro...
4,5830,404830,2594.0,2009-01-01T10:31:23Z,,2,Cocoa Won't Capture Shift Modifier?,"<p>Hey All,\nI have an application in which I'..."


In [5]:
# Preview the first rows of the training answers frame.
train_answers.head()

Unnamed: 0.1,Unnamed: 0,Id,OwnerUserId,CreationDate,ParentId,Score,Body
0,21617,404434,1288.0,2009-01-01T02:46:21Z,404430,6,<p>The most common use cases are to find strin...
1,21618,404436,20029.0,2009-01-01T02:46:58Z,404430,1,<p>Stack Overflow is in fact a good place to f...
2,21619,404437,15000.0,2009-01-01T02:47:37Z,404430,0,<p>Validating an email address is something i ...
3,21620,404438,14356.0,2009-01-01T02:48:28Z,404430,12,<p>Many things. Including:</p>\n\n<ul>\n<li>E...
4,21621,404440,38803.0,2009-01-01T02:50:10Z,404430,2,"<p>Well, any time you need to match something ..."


In [6]:
# Narrow the questions frame to the id, score and the two text columns.
q_cols = ['Id', 'Score', 'Title', 'Body']
train_Q = train_questions.loc[:, q_cols]
train_Q.shape

(34279, 4)

In [7]:
# Narrow the answers frame to its id, parent question id, score and body text.
a_cols = ['Id', 'ParentId', 'Score', 'Body']
train_A = train_answers.loc[:, a_cols]
train_A.shape

(112854, 4)

# 2. Clean text and create TF-IDF model using "Title"

## 2.1. Clean text

In [8]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [9]:
# Collect the question titles, one entry per row of train_Q.
# Reading the column directly replaces the row-by-row iterrows() loop, which
# built a full Series per row only to pick out a single field.
headlines = train_Q['Title'].tolist()
len(headlines)

34279

In [10]:
# Build the cleaned title corpus: replace every non-letter with a space,
# lowercase, split on whitespace, Porter-stem each token and re-join the
# stems with single spaces.
ps = PorterStemmer()
corpus = []
for title in headlines:
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    stemmed = ' '.join(ps.stem(token) for token in tokens)
    corpus.append(stemmed)

In [11]:
# Spot-check the first cleaned and stemmed title.
corpus[0]

'what have you use regular express for'

## 2.2. Creating the TF-IDF model

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vocabulary on the stemmed titles and vectorise them.
# NOTE(review): .toarray() converts the sparse result to a dense array; at the
# (34279, 11178) shape reported for train_QT that is roughly 3 GB if the
# values are float64 (the scikit-learn default) - confirm this fits in memory.
cv = TfidfVectorizer()
train_QT = cv.fit_transform(corpus).toarray()

In [13]:
# (number of titles, size of the fitted vocabulary)
train_QT.shape

(34279, 11178)

In [14]:
# Display the dense title TF-IDF matrix (NumPy abbreviates the repr).
train_QT

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# 3. Clean text and create TF-IDF model using "Body"

## 3.1. Clean text

In [15]:
# Collect the raw question bodies, one entry per row of train_Q.
# Reading the column directly replaces the row-by-row iterrows() loop and
# drops the disabled sentence-tokenising snippet that was kept as a string
# literal inside the loop body.
headlinesB = train_Q['Body'].tolist()
len(headlinesB)

34279

## 3.2. Clean text and create TF-IDF model

In [16]:
# Build the cleaned body corpus: replace every non-letter with a space,
# lowercase, split on whitespace, Porter-stem each token and re-join the
# stems with single spaces.
# NOTE(review): HTML markup is not removed first, so tag names such as "p"
# survive as tokens - consider stripping tags before this step.
psB = PorterStemmer()
corpusB = []
for body in headlinesB:
    tokens = re.sub('[^a-zA-Z]', ' ', body).lower().split()
    # Stem with this cell's own stemmer (psB). The loop previously called
    # `ps` from the title cell, leaving psB unused and making this cell
    # depend on that earlier cell having been run.
    stemmed = ' '.join(psB.stem(token) for token in tokens)
    corpusB.append(stemmed)

In [17]:
# Spot-check the first cleaned body. The stray "p" tokens are what is left of
# the <p> tags once the cleaning regex replaces non-letters with spaces.
corpusB[0]

'p i have heard of regular express and onli seen use case for a few thing so i don t think of use them veri often in the past i have done a coupl of thing and it ha taken me hour to do later i talk to someon and they say here is how to do it use a regular express p p so what are thing for which you have use regular express if i get more exampl then mayb i can begin to know when to look for and use them p'

In [18]:
# Fit a TF-IDF vocabulary on the stemmed bodies, capped at 10000 features via
# max_features, and vectorise them.
# NOTE(review): .toarray() converts the sparse result to a dense array; with
# 34279 bodies and up to 10000 columns that is on the order of 2.7 GB if the
# values are float64 (the scikit-learn default) - confirm this fits in memory.
cvB = TfidfVectorizer(max_features=10000)
train_QB = cvB.fit_transform(corpusB).toarray()

In [19]:
# Display the dense body TF-IDF matrix (NumPy abbreviates the repr).
train_QB

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])