# 1. Load Data

In [1]:
import pandas as pd

In [2]:
# Raw question dump for 2009, read relative to the notebook's working directory.
questions_csv = './data/questions_2009.csv'
train_questions = pd.read_csv(questions_csv)

In [3]:
# (rows, columns) of the raw frame. Left as the cell's last expression so the
# notebook displays it, matching the other shape checks in this notebook.
train_questions.shape

(34279, 8)


In [4]:
# Preview the first rows of the raw frame to see which columns are available.
train_questions.head()

Unnamed: 0.1,Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,5826,404430,23571.0,2009-01-01T02:39:17Z,,8,What have you used Regular Expressions for?,<p>I have heard of regular expressions and onl...
1,5827,404450,,2009-01-01T02:55:13Z,,1,LINQ to SQL,<p>I am finishing off a C# ASP.NET program tha...
2,5828,404470,24457.0,2009-01-01T03:17:22Z,,55,What MIME type if JSON is being returned by a ...,<p>My REST API returns JSON. </p>\n\n<p>I'm cu...
3,5829,404600,4.0,2009-01-01T05:51:41Z,,4,SQL Server Enterprise Manager 2005 - stored pr...,<p>When using SQL Server Management Studio fro...
4,5830,404830,2594.0,2009-01-01T10:31:23Z,,2,Cocoa Won't Capture Shift Modifier?,"<p>Hey All,\nI have an application in which I'..."


In [5]:
# Keep only the identifier, the score and the two text fields of each question.
q_cols = ['Id', 'Score', 'Title', 'Body']
train_Q = train_questions.loc[:, q_cols]
# Row count must be unchanged; only the column count shrinks.
train_Q.shape

(34279, 4)

# 2. Clean text and create TF-IDF model using "Title"

## 2.1. Clean text

In [6]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [7]:
# Pull the titles straight out of the column. Iterating with iterrows() builds a
# Series per row just to read one field, which is slow and yields the same list.
headlines = train_Q['Title'].tolist()
len(headlines)

34279

In [8]:
ps = PorterStemmer()
# Build the stop-word set once. The original evaluated
# stopwords.words('english') for every single token and tested membership
# against a list, which dominates the runtime of this cell.
english_stopwords = set(stopwords.words('english'))

corpus = []
for headline in headlines:
    # Replace everything that is not an ASCII letter with a space, lowercase,
    # then split on whitespace to get the raw tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', headline).lower().split()
    # Drop stop words, stem what is left, and store the title as one string.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [9]:
# Spot-check the cleaned form of the first title.
corpus[0]

'use regular express'

## 2.2. Creating the TF-IDF model

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams of the cleaned titles, capped at 10000 features.
cv = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
tfidf_sparse = cv.fit_transform(corpus)
# Densify so the matrix can be wrapped in a plain DataFrame further down.
train_QT = tfidf_sparse.toarray()

In [11]:
# (number of titles, number of TF-IDF features).
train_QT.shape

(34279, 10000)

In [12]:
# Display the dense TF-IDF matrix; the output below shows an abbreviated repr.
train_QT

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One column per TF-IDF feature, one row per title.
df1 = pd.DataFrame(train_QT, columns=feature_names)
df1.shape

(34279, 10000)

In [14]:
# Remove every listed column from the raw frame; per the shapes shown in this
# notebook, a single column is left to sit alongside the TF-IDF features.
non_feature_cols = ['Unnamed: 0', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Title', 'Body', 'Score']
df = train_questions.drop(columns=non_feature_cols)

In [15]:
# Place the remaining question columns and the TF-IDF features side by side,
# aligned on the row index.
res = pd.concat([df, df1], axis='columns')
res.shape

(34279, 10001)

In [17]:
# Write the combined frame to disk without the row index.
output_csv = 'data/data.csv'
res.to_csv(output_csv, index=False)