# Setting Parameters

In [None]:
import pandas as pd
import numpy as np
import sqlite3
import pickle
from textblob import Word
import gc
import matplotlib.pyplot as plt
import re
from collections import Counter
import math
%matplotlib inline 
from sklearn.metrics import mean_squared_error

# Notebook-wide parameters.
# no_of_posts: how many reddit posts to fetch from the database.
# chunk_size: divisor used by the training loop (no_of_posts / chunk_size)
#             to work out how many chunk files to read back.
no_of_posts = 10 ** 5
chunk_size = 10 ** 2

#  Setting column names for training data frame

In [None]:
# Read the posts used to derive the training columns from reddit_data.db.
#
# The original filter chained "random_number != ..." tests with "or", which is
# true for every non-NULL value, so no row was ever excluded. NOT IN expresses
# the apparent intent: leave out rows whose random_number is 36, 53 or 24
# (presumably a held-out split -- confirm against the evaluation code).
# The row limit is bound from no_of_posts instead of being repeated as a
# literal, so the query follows the notebook parameter.
con = sqlite3.connect("reddit_data.db")
sql_query = ("select title, subreddit, domain from reddit_posts "
             "where random_number not in (36, 53, 24) limit ?")
try:
    df = pd.read_sql(sql_query, con, params=(no_of_posts,))
finally:
    # Release the connection even when the query fails.
    con.close()


# One prefixed column name per distinct subreddit / domain. str() is applied
# to both so that a NULL or non-string value cannot break the concatenation.
unique_values_subreddit = df["subreddit"].unique().tolist()
unique_values_subreddit = ["subreddit_" + str(s) for s in unique_values_subreddit]
unique_values_domain = df["domain"].unique().tolist()
unique_values_domain = ["domain_" + str(s) for s in unique_values_domain]

from nltk.corpus import stopwords
stop = stopwords.words('english')
# Set copy of the stop words: O(1) membership instead of a list scan per token.
stop_lookup = set(stop)
non_letters = re.compile('[^a-zA-Z ]')


def _title_tokens(title):
    """Return the cleaned tokens of one title.

    The title is lower-cased, every character other than a letter or a space
    is removed, and the remainder is split on whitespace. Only tokens of
    4 to 9 characters that are not English stop words are kept.
    """
    words = non_letters.sub('', title.lower()).split()
    return [str(w) for w in words if 3 < len(w) < 10 and w not in stop_lookup]


df["title"] = df["title"].map(_title_tokens)

lists = df["title"].tolist()
unique_values_title = list(set(item for sublist in lists for item in sublist))
# Column order: domains, then subreddits, then title words.
training_df_column_names = unique_values_domain + unique_values_subreddit + unique_values_title

#saving the list of training_df_column_names in a pkl file:
with open("training_df_column_names.pkl","wb") as f:
    pickle.dump(training_df_column_names, f)

# TF-IDF values for words in title

In [None]:
# Inverse-document-frequency weight for every title word:
#     log(number_of_titles / (1 + number_of_titles_containing_word))
#
# Document frequencies are gathered in a single pass over the titles. Each
# title is reduced to a set first so a word repeated inside one title is
# counted once, which matches the original "w in sublist" test. The original
# rescanned every title for every word.
doc_freq = Counter()
for sublist in lists:
    doc_freq.update(set(sublist))

# Use the number of titles actually loaded (equal to no_of_posts whenever the
# query returns its full limit) and force float division so the ratio is not
# truncated to an integer under Python 2.
n_docs = float(len(lists))

tfidf_dict = {}
for w in unique_values_title:
    tfidf_dict[w] = math.log(n_docs / (1 + doc_freq[w]))

with open("tfidf_dict.pkl",'wb') as f:
    pickle.dump(tfidf_dict, f)

# Preparing data for model training

In [None]:
def df_format(df):
    """Normalise the text columns of a posts DataFrame in place.

    Only the columns that are present are touched:

    * ``title``     -- each value is lower-cased, stripped of every character
      that is not a letter or a space, split on whitespace, and reduced to
      the tokens that are 4 to 9 characters long and are not NLTK English
      stop words. The column ends up holding a list of ``str`` per row.
    * ``domain``    -- each value becomes ``"domain_" + str(value)``.
    * ``subreddit`` -- each value becomes ``"subreddit_" + str(value)``.

    Args:
        df: DataFrame to normalise. Title values must be strings.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    if "title" in df.columns:
        # Imported here so frames without a title column do not need NLTK.
        from nltk.corpus import stopwords
        # A set makes each stop-word test O(1); the list NLTK returns would be
        # scanned once per token.
        stop = set(stopwords.words('english'))
        non_letters = re.compile('[^a-zA-Z ]')

        def clean_title(title):
            # Same steps as before, done in one pass over the column.
            words = non_letters.sub('', title.lower()).split()
            return [str(w) for w in words if 3 < len(w) < 10 and w not in stop]

        df["title"] = df["title"].map(clean_title)
    if "domain" in df.columns:
        df["domain"] = df["domain"].map(lambda x: "domain_"+str(x))
    if "subreddit" in df.columns:
        df["subreddit"] = df["subreddit"].map(lambda x: "subreddit_"+str(x))
    return df

#loading the training_df_column_names.pkl
with open("training_df_column_names.pkl","rb") as f:
    training_df_column_names = pickle.load(f)

#loading tf-idf values for all words in title
with open("tfidf_dict.pkl","rb") as f:
    tfidf_dict = pickle.load(f)

# Set copy of the column names: membership is tested once per domain,
# subreddit and title token, and scanning the full list each time is wasteful.
known_columns = set(training_df_column_names)

# Reading the training values from the database, chunk by chunk.
# - "ups" is selected as well: it is written out below as the regression
#   target, and the original query omitted it, so d["ups"] raised KeyError.
# - The original WHERE clause chained "!=" tests with "or", which is true for
#   every non-NULL value; NOT IN actually excludes random_number 36, 53, 24.
# - The limit and chunk size come from no_of_posts / chunk_size, because the
#   training loop derives the number of chunk files from those two values.
con = sqlite3.connect("reddit_data.db")
sql_query = ("select title, subreddit, domain, ups from reddit_posts "
             "where random_number not in (36, 53, 24) limit ?")

try:
    df = pd.read_sql(sql_query, con, params=(no_of_posts,), chunksize=chunk_size)
    print('sql')
    for i, d in enumerate(df):
        # Force a 0..n-1 index so the row labels from iterrows() line up with
        # the np.arange index of training_df.
        d = df_format(d).reset_index(drop=True)
        training_df = pd.DataFrame(0, index=np.arange(int(d.shape[0])), columns=training_df_column_names)

        # Count occurrences: one for the post's domain, one for its subreddit
        # and one per title token, ignoring values that have no column.
        for index, row in d.iterrows():
            if str(row["domain"]) in known_columns:
                training_df.loc[index, row["domain"]] += 1
            if str(row["subreddit"]) in known_columns:
                training_df.loc[index, row["subreddit"]] += 1
            for w in row["title"]:
                if w in known_columns:
                    training_df.loc[index, w] += 1

        # Scale the word-count columns seen in this chunk by their stored
        # tf-idf weight. Words with no column or no stored weight are skipped
        # instead of raising KeyError.
        chunk_words = set(item for sublist in d["title"].tolist() for item in sublist)
        for w in chunk_words:
            if w in known_columns and w in tfidf_dict:
                training_df[w] = training_df[w]*tfidf_dict[w]

        name_training = 'msg_data/training_df_'+str(i)+'.msg'
        name_ups = 'msg_data/ups_df_'+str(i)+'.msg'
        # NOTE(review): to_msgpack was removed in pandas 1.0; kept because the
        # training cell reads these files back with read_msgpack.
        d["ups"].to_msgpack(name_ups, use_bin_type=True)
        training_df.to_msgpack(name_training, use_bin_type=True)
        # Drop the reference first so the collection can reclaim the frame.
        del training_df
        gc.collect()
finally:
    # Close the connection even if a chunk fails part-way through.
    con.close()
gc.collect()


# Training Regression Model

In [None]:
from sklearn.linear_model import SGDRegressor
import pandas as pd
import numpy as np
import math


# Fit a linear regressor incrementally, one chunk file at a time.
model = SGDRegressor(alpha=0.000050)

# Floor division keeps the chunk count an int; plain "/" yields a float on
# Python 3 and range() rejects it.
for i in range(no_of_posts // chunk_size):

    x = pd.read_msgpack('msg_data/training_df_'+str(i)+'.msg')

    y = pd.read_msgpack('msg_data/ups_df_'+str(i)+'.msg')
    # Log-transform the target; zeros are passed through unchanged because
    # log(0) is undefined. .values replaces the deprecated .as_matrix().
    y = y.apply(lambda ups: math.log(ups) if ups != 0 else ups).values
    model.partial_fit(x, y)

# Persist the fitted model.
filename = 'model_trained.pkl'
with open(filename, "wb") as f:
    pickle.dump(model, f)