# Quick Regression Model on Hacker News Dataset
- Predicting upvotes from post titles using a bag-of-words model

In [30]:
#analytics
import pandas as pd
import numpy as np

#random
import random

#Natural language processing: stopword list, stemmer and bag-of-words vectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

##### Reading in Our Dataset
The dataset has over 200K post titles with the respective number of likes (or number of points) and the number of comments. Since it's not practical to train on a dataset this large on a local computer, we're going to subset the data.

[dataset](https://www.kaggle.com/hacker-news/hacker-news-posts)

In [2]:
# Location of the Hacker News posts CSV on the author's machine
hn_posts_csv = 'C:/Users/Darshil/gitly/Deep-Learning/My Projects/Flask-app/project_specific/E2_predict_likes_hackernews/hn_posts.csv'
hack_data = pd.read_csv(hn_posts_csv)

# Peek at the first 10 posts that scored more than 100 points
popular_mask = hack_data['num_points'] > 100
hack_data[popular_mask].head(10)

Unnamed: 0,title,num_points,num_comments
67,Appropriate Uses for SQLite,125,56
88,UnGoogled Chromium: Chromium with enhanced pri...,251,120
127,Designing and producing 2FA tokens to sell on ...,138,52
158,Finance is Not the Economy,237,115
239,The decline of Stack Overflow (2015),291,255
240,Bidirectional Replication is coming to Postgre...,200,38
275,Bay Area wages soaring but still cant keep up ...,103,166
282,Park.io automating tasks to make $125k per month,342,146
288,Game Genie declassified: That summer I played ...,156,40
295,John Carmack .plan Archive (2014),122,38


#### Before we extract our features we need to subset the data

In [18]:
# Build the training subset: `above_200` posts with more than `thresh` points
# plus `total - above_200` posts with `thresh` points or fewer.

#breakdown - make change here only
thresh = 200       # num_points cut-off between the two groups
above_200 = 1000   # number of rows drawn from posts with num_points > thresh
total = 2000       # total number of rows in the training subset
seed = 42          # fixed seed so a fresh kernel draws the same subset every run

#creating our training data and concating
#pick `above_200` posts above the threshold
df1 = hack_data[hack_data['num_points'] > thresh].sample(n=above_200, random_state=seed)

#pick the remaining `total - above_200` posts at or below the threshold
df2 = hack_data[hack_data['num_points'] <= thresh].sample(n=total - above_200, random_state=seed)

#concating both dfs into our training dataset
df = pd.concat([df1, df2])
print (df.head() ,'\n \n' ,df.shape)

                                                    title  num_points  \
145665  That Time an SR-71 Made an Emergency Landing i...         276   
88088   Sci-Hub Is Blowing Up the Academic Publishing ...         291   
19198   How a Japanese cucumber farmer is using deep l...         423   
9262                              Why are Adults so busy?         508   
68881   The British are Googling what the E.U. is, hou...         413   

        num_comments  
145665            75  
88088            115  
19198            127  
9262             471  
68881            568   
 
 (2000, 3)


In [34]:
"""st = PorterStemmer()
b = 'i am working'

txt = " ".join([st.stem(w) for w in b.split(' ')])
txt"""

'i am work'

#### Extracting our features

In [49]:
"""
Now we'll apply the bag of words model on our title column, but before we do that we need to clean our data:
- remove all stop words
- remove words with 5 or fewer characters
- remove stand-alone punctuation tokens
"""

# Lookup tables used when cleaning the "title" column.
# "\u2019" is the right single quotation mark (the previous entry was its
# mis-decoded form and could never match it).
punctuations_list = [",", ":", ";", ".", "'", '"', "\u2019", "?", "/", "-", "+", "&", "(", ")"]
# English stopword list from NLTK
sw = stopwords.words('english')

#Here we have created a function that will clean up a text column of the global `df`
def preprocess_textcol(name, min_length=6):
    """Filter the words of text column `name` of the global `df`.

    Every value is split on single spaces and a word is kept only when it
    - is at least `min_length` characters long (the default of 6 is the
      original `len(word) > 5` rule),
    - is not one of the stand-alone tokens in `punctuations_list`, and
    - is not an English stopword from `sw`, compared case-insensitively.

    Punctuation attached to a word (e.g. "Programming:") and numeric tokens
    are not removed here.

    Parameters
    ----------
    name : str
        Name of the column of `df` to clean.
    min_length : int, optional
        Minimum number of characters a word needs in order to be kept.

    Returns
    -------
    pandas.Series
        The cleaned column; it is also stored on `df` as 'new_<name>'.
    """
    # NLTK's English stopwords are all lower-case, so a single lower-cased
    # lookup covers the raw/lower/upper/title checks; sets give O(1) lookups.
    stop_words = set(sw)
    skip_tokens = set(punctuations_list)

    def keep(word):
        # empty strings come from consecutive spaces in the title
        return (word != ''
                and len(word) >= min_length
                and word not in skip_tokens
                and word.lower() not in stop_words)

    new_col = 'new_{}'.format(name)
    df[new_col] = df[name].apply(
        lambda text: ' '.join(word for word in text.split(' ') if keep(word)))

    #stemming is currently disabled
    #df[new_col] = df[new_col].apply(lambda x: ' '.join([PorterStemmer().stem(word) for word in x.split(' ')]))

    return df[new_col]

#creating the cleaned "new_title" column out of the original "title" column
df['new_title'] = preprocess_textcol('title')

In [50]:
#preview the first 10 cleaned titles
cleaned_titles = df['new_title']
cleaned_titles.head(10)

145665                        Emergency Landing Norway
88088     Sci-Hub Blowing Academic Publishing Industry
19198     Japanese cucumber farmer learning TensorFlow
9262                                            Adults
68881                          British Googling voting
101686                               fucking webmaster
149675                                 Gneural Network
256992                  abusing Unicode create tragedy
218569                    Bullshit Startup Experiences
11270                      Dynamic Programming: (1984)
Name: new_title, dtype: object

#### Next we need to create our bag of words model and create a count vector matrix

In [53]:
# Bag of words over the cleaned titles; min_df=4 keeps only terms that occur
# in at least 4 titles.
vectorizer = CountVectorizer(min_df=4)
data_corpus = list(df['new_title'].values.astype('U'))
X = vectorizer.fit_transform(data_corpus)

# Densify once and reuse it for both the preview and the training frame
dense_counts = X.toarray()
print(dense_counts)
#print(vectorizer.get_feature_names())

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per title, one column per vocabulary term
training = pd.DataFrame(dense_counts, columns=feature_names)
training.shape
#test_df

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


(2000, 302)

In [54]:
## Saving this to local so we deploy it in flask for the count vec function
# Keep only the cleaned text. errors='ignore' skips any column that is already
# absent, so `deploy_df` is always defined (the previous bare `except:` hid
# every error and could leave it undefined or stale before the write below).
deploy_df = df.drop(['num_points','num_comments','title'], axis=1, errors='ignore')

deploy_df.to_csv('C:/Users/Darshil/gitly/Deep-Learning/My Projects/Flask-app/project_specific/E2_predict_likes_hackernews/deploy_df.csv', index=False)

#### Last but not least, we need to train the model using scikit-learn

In [55]:
from sklearn import linear_model

#create the linear regressor, then fit it on the word counts against the post points
model = linear_model.LinearRegression()
target_points = df['num_points']
model.fit(training, target_points)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [59]:
# Now we predict!
sample_titles = ['google google google google algorithm algorithm', 'adults busy']
test_val = vectorizer.transform(sample_titles)
#print ('New value: ', test_val.toarray()[0])
predicted_points = model.predict(test_val.toarray())
print (predicted_points)

[517.79947579 197.01124095]


In [60]:
def convert_vec(text):
    """Vectorize one string with the fitted `vectorizer` and return the dense count array."""
    return vectorizer.transform([text]).toarray()

#model.predict(convert_vec('google'))
convert_vec('text').shape

(1, 302)

#### This model is horrible; we will need to train it on a much larger dataset
#### Next we need to save this model to deploy it

In [61]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back to the vendored
# copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted regressor to a file in the current working directory
joblib.dump(model,'predict_likes_hackernews')

['predict_likes_hackernews']

In [None]:
"""# Example code for count vectorization (from other project)
data_corpus = ["John likes to watch movies. Mary likes movies too.", 'darshil also likes to watch movies']
X = vectorizer.fit_transform(data_corpus) 
print(X.toarray())
print(vectorizer.get_feature_names())

test_df = pd.DataFrame(X.toarray(), columns =vectorizer.get_feature_names() )
test_df

#new values
new_val = vectorizer.transform(['hello likes to']) 
print ('New value: ', new_val.toarray()[0])

project1_env"""