# Importing libraries

In [1]:
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import LabelEncoder
pd.options.display.float_format = '{:.8f}'.format
import os, sys
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
from fastai.imports import *
from fastai.structured import *

# Reading Files

In [2]:
# Both files are read as tab-separated (sep='\t') even though they are named *.csv.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines='skip'`; switch when the environment's pandas is upgraded.
test, train = (
    pd.read_csv(csv_name, error_bad_lines=False, sep='\t')
    for csv_name in ('test.csv', 'train.csv')
)

In [3]:
# Preview the first five rows of the training frame.
train.head(5)

Unnamed: 0,id,question_id,subreddit,question_utc,question_text,question_score,answer_utc,answer_text,answer_score
0,330435,f48a5420fa6a66ecec95365dd67bdc99,AskReddit,1512086400,All my buddies love Pacific Rim and always tal...,179,1512086616,"I didn't really care for it, I thought the pac...",70
1,1944975,f48a5420fa6a66ecec95365dd67bdc99,AskReddit,1512086400,All my buddies love Pacific Rim and always tal...,179,1512090694,"I thought it was hilarious, and I didn't reali...",24
2,2218735,f48a5420fa6a66ecec95365dd67bdc99,AskReddit,1512086400,All my buddies love Pacific Rim and always tal...,179,1512090763,"&gt; Pacific Rim The porno, or the studio rel...",11
3,2133251,f48a5420fa6a66ecec95365dd67bdc99,AskReddit,1512086400,All my buddies love Pacific Rim and always tal...,179,1512091349,I enjoyed it more than most movies of that gen...,42
4,2133252,f48a5420fa6a66ecec95365dd67bdc99,AskReddit,1512086400,All my buddies love Pacific Rim and always tal...,179,1512133012,"Its terrible, its clearly made to be a self aw...",1


## Defining our target column and excluding it from our data set

In [4]:
# Keep the regression target in `y`, then leave only feature columns in `train`.
y = train['answer_score']
train = train.drop(columns=['answer_score'])
train.shape, test.shape

((852885, 8), (663082, 8))

### Converting the Unix-timestamp columns to datetime

In [5]:
# Parse the epoch-second columns into datetimes, stored next to the raw values.
# The '_transofrmed' suffix (sic) is kept because later cells use these names.
for frame in (train, test):
    for utc_col in ('question_utc', 'answer_utc'):
        frame[utc_col + '_transofrmed'] = pd.to_datetime(frame[utc_col], unit='s')

### Creating new features from the time series: week number, day of month, day of week, day of year

In [6]:
def add_dateparts(df, fldname, drop=True):
    """Expand a datetime column of ``df`` into calendar-part columns, in place.

    Adds ``<prefix>Week``, ``<prefix>Day``, ``<prefix>Dayofweek`` and
    ``<prefix>Dayofyear``, where ``<prefix>`` is ``fldname`` with a trailing
    ``date``/``Date`` removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Column to expand. If it is not already a datetime column (naive or
        tz-aware) it is parsed with ``pd.to_datetime`` and written back.
    drop : bool, default True
        Remove ``fldname`` from ``df`` after the parts are extracted.

    Returns
    -------
    None
        ``df`` is mutated; nothing is returned.
    """
    # Local import: elsewhere in the notebook `re` is only available through
    # a star import, which this function should not depend on.
    import re

    fld = df[fldname]
    # is_datetime64_any_dtype also accepts tz-aware columns, on which
    # np.issubdtype(fld.dtype, np.datetime64) raises TypeError.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # infer_datetime_format is deprecated in pandas 2.x, so it is omitted.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Week', 'Day', 'Dayofweek', 'Dayofyear'):
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is
            # its replacement. Cast back to a plain NumPy dtype (float when
            # missing values are present) to match what dt.week produced.
            week = fld.dt.isocalendar().week
            df[targ_pre + n] = week.astype('float64' if week.isna().any() else 'int64')
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())
    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [7]:
# Expand both parsed timestamp columns on both frames (answer first, then
# question, so the resulting column order is the same for train and test).
for frame in (train, test):
    for parsed_col in ('answer_utc_transofrmed', 'question_utc_transofrmed'):
        add_dateparts(frame, parsed_col)

In [8]:
# Difference between the answer and question timestamps (raw UTC columns).
for frame in (train, test):
    frame['answ_quest_dif'] = frame['answer_utc'] - frame['question_utc']

### More feature engineering to extract more data from the text given

In [9]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives simple features from the answer text.

    Expects a DataFrame with string column ``answer_text`` and numeric column
    ``question_score``. ``transform`` returns a copy with these columns added:

    * ``log_len`` - natural log of the answer length in characters
      (an empty answer gives ``-inf``);
    * ``question_mark_count`` / ``question_mark_present``;
    * ``exclamation_mark_count`` / ``exclamation_mark_present``;
    * ``log_question_score`` - ``log(question_score + 1)``;
    * ``ratio_capitalized`` - share of upper-case characters (0.0 for an
      empty answer).
    """

    def __init__(self):
        super(FeatureEngineer, self).__init__()

    def fit(self, data, *args, **kwargs):
        """Nothing is learned from ``data``; returns ``self``."""
        return self

    def transform(self, data, *args, **kwargs):
        """Return a copy of ``data`` with the derived feature columns appended."""
        # Work on a copy so the caller's frame is not silently mutated.
        data = data.copy()

        answers = data["answer_text"]
        # Character count, computed once and reused below.
        n_chars = answers.apply(len)

        data['log_len'] = np.log(n_chars)

        data["question_mark_count"] = [s.count("?") for s in answers.values]
        data["question_mark_present"] = (data["question_mark_count"] > 0).astype(int)

        data["exclamation_mark_count"] = [s.count("!") for s in answers.values]
        data["exclamation_mark_present"] = (data["exclamation_mark_count"] > 0).astype(int)

        data["log_question_score"] = np.log(data["question_score"] + 1)

        def count_capitalized(text):
            return sum(1 for c in text if c.isupper())

        n_upper = answers.apply(count_capitalized)
        # Mask zero lengths before dividing: 0/0 would otherwise put NaN into
        # the feature for empty answers; define the ratio as 0.0 there.
        data["ratio_capitalized"] = (n_upper / n_chars.where(n_chars > 0)).fillna(0.0)

        return data

In [10]:
# The transformer holds no fitted state, so one instance serves both frames.
featureEngineer = FeatureEngineer()
train, test = featureEngineer.transform(train), featureEngineer.transform(test)

In [11]:
def n_answers(df):
    """Return ``df`` with an ``answers_count`` column added.

    ``answers_count`` is the number of rows in ``df`` that share the row's
    ``question_id``. The input frame is not modified; the result comes from a
    left merge, so row order is kept and the index is a fresh RangeIndex.
    """
    # Name the index and the counts explicitly instead of renaming whatever
    # reset_index() produces: value_counts() labels its result differently
    # before and after pandas 2.0 ('index'/'question_id' vs 'question_id'/'count').
    counts = (
        df['question_id']
        .value_counts()
        .rename_axis('question_id')
        .reset_index(name='answers_count')
    )
    return df.merge(counts, on=['question_id'], how='left')

In [12]:
# Answer counts are computed within each frame separately.
train, test = n_answers(train), n_answers(test)

In [13]:
# Raw strings keep the regex escapes (\w, \d, \.) from being read as
# (invalid) Python string escapes.
# http(s) URL up to the end of the host part: runs of '-', word characters,
# '.', or %XX escapes.
url_regex = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
# http(s) link ending in an image extension. The character class excludes
# ')', "'" and '"'. The earlier single-quoted spelling contained '' in the
# middle, which Python treats as two adjacent literals, so the single quote
# was silently dropped from the class.
img_regex = r"""https?:[^)'"]+\.(?:jpg|jpeg|gif|png)"""

In [14]:
for df in [train, test]:
    answers = df["answer_text"]
    df["text_length"] = answers.apply(len)
    # Raw match counts; images are subtracted from links before either is capped.
    n_imgs = answers.apply(lambda text: len(re.findall(img_regex, text)))
    n_urls = answers.apply(lambda text: len(re.findall(url_regex, text)))
    # Cap the counts: at most 6 images and 10 non-image links per answer.
    df["answer_imgs"] = n_imgs.clip(upper=6)
    df["answer_links"] = (n_urls - n_imgs).clip(upper=10)

In [15]:
# Sanity check: both frames should have the same number of columns.
train.shape, test.shape

((852885, 28), (663082, 28))

### Creating vectors for NLP problem

In [16]:
# All answer and question texts from both frames, used to fit the vectorizer.
concat_text = pd.concat(
    [
        train['answer_text'],
        test['answer_text'],
        train['question_text'],
        test['question_text'],
    ]
)

In [17]:
# Word-level unigram TF-IDF with sublinear tf scaling, accent stripping,
# English stop words and no cap on vocabulary size.
tv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=None,
)

In [18]:
# Fit the TF-IDF vectorizer on the combined train + test text.
tv.fit(concat_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [19]:
# The same fitted vectorizer encodes questions and answers, so all four
# matrices share one column space.
train_text_question_features, train_text_answer_features = (
    tv.transform(train[text_col]) for text_col in ('question_text', 'answer_text')
)
test_text_question_features, test_text_answer_features = (
    tv.transform(test[text_col]) for text_col in ('question_text', 'answer_text')
)

In [20]:
# One-hot encoder with sparse output; fitted on the subreddit labels in a later cell.
lb = LabelBinarizer(sparse_output=True)

In [21]:
# Default CountVectorizer; fitted on the question ids in a later cell.
cv = CountVectorizer()

In [22]:
# Stack train and test values so the encoders see every id / subreddit.
qid, subr = (
    pd.concat([train[key_col], test[key_col]])
    for key_col in ('question_id', 'subreddit')
)

In [23]:
# Encode question ids as token counts.
# NOTE(review): tr_qid / te_qid are not passed to the hstack calls that build
# x and x_test in this notebook - confirm whether they are still needed.
cv.fit(qid)
tr_qid, te_qid = cv.transform(train['question_id']), cv.transform(test['question_id'])

In [24]:
# One-hot encode the subreddit using the label set from train + test.
lb.fit(subr)
tr_subr, te_subr = lb.transform(train['subreddit']), lb.transform(test['subreddit'])

In [25]:
# Drop the raw text, key and id columns; the text and subreddit are already
# encoded as sparse matrices.
train_d, test_d = (
    frame.drop(columns=['question_text', 'answer_text', 'question_id', 'subreddit', 'id'])
    for frame in (train, test)
)
train_d.shape, test_d.shape

((852885, 23), (663082, 23))

### Stacking our dataset and processed matrices

In [26]:
# Training matrix: question TF-IDF | answer TF-IDF | subreddit one-hot | tabular features.
train_blocks = [
    train_text_question_features,
    train_text_answer_features,
    tr_subr,
    train_d,
]
x = hstack(train_blocks)

In [27]:
# Test matrix, with the blocks in the same order as the training matrix.
test_blocks = [
    test_text_question_features,
    test_text_answer_features,
    te_subr,
    test_d,
]
x_test = hstack(test_blocks)

# Model

In [28]:
# Hold out 15% of the rows for validation. The target is modelled on the
# log1p scale, so the reported RMSE is measured on log1p(answer_score).
train_X, valid_X, train_y, valid_y = train_test_split(
    x, np.log1p(y), test_size=0.15, random_state=42
)
d_train = lgb.Dataset(train_X, label=train_y)
d_valid = lgb.Dataset(valid_X, label=valid_y)
watchlist = [d_train, d_valid]
params = dict(
    learning_rate=0.01,
    application='regression',
    max_depth=12,
    num_leaves=140,
    verbosity=-1,
    metric='RMSE',
    data_random_seed=1,
    bagging_fraction=1,
    nthread=8,
)
# Up to 2000 boosting rounds with early stopping after 50 rounds without
# improvement; metrics are printed every 10 rounds.
model = lgb.train(
    params,
    train_set=d_train,
    num_boost_round=2000,
    valid_sets=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

Training until validation scores don't improve for 50 rounds.
[10]	training's rmse: 1.03785	valid_1's rmse: 1.03788
[20]	training's rmse: 0.999702	valid_1's rmse: 1.00004
[30]	training's rmse: 0.966954	valid_1's rmse: 0.967524
[40]	training's rmse: 0.938917	valid_1's rmse: 0.939722
[50]	training's rmse: 0.914935	valid_1's rmse: 0.915986
[60]	training's rmse: 0.894418	valid_1's rmse: 0.8957
[70]	training's rmse: 0.876901	valid_1's rmse: 0.878464
[80]	training's rmse: 0.861943	valid_1's rmse: 0.863772
[90]	training's rmse: 0.849143	valid_1's rmse: 0.851271
[100]	training's rmse: 0.838161	valid_1's rmse: 0.840586
[110]	training's rmse: 0.828723	valid_1's rmse: 0.831445
[120]	training's rmse: 0.820637	valid_1's rmse: 0.823657
[130]	training's rmse: 0.813656	valid_1's rmse: 0.816974
[140]	training's rmse: 0.807577	valid_1's rmse: 0.811219
[150]	training's rmse: 0.802319	valid_1's rmse: 0.80628
[160]	training's rmse: 0.797711	valid_1's rmse: 0.801947
[170]	training's rmse: 0.793663	valid_1's

[1430]	training's rmse: 0.726877	valid_1's rmse: 0.751551
[1440]	training's rmse: 0.726748	valid_1's rmse: 0.751499
[1450]	training's rmse: 0.726637	valid_1's rmse: 0.751459
[1460]	training's rmse: 0.726538	valid_1's rmse: 0.751424
[1470]	training's rmse: 0.726412	valid_1's rmse: 0.751375
[1480]	training's rmse: 0.726271	valid_1's rmse: 0.751314
[1490]	training's rmse: 0.726161	valid_1's rmse: 0.751282
[1500]	training's rmse: 0.726028	valid_1's rmse: 0.751225
[1510]	training's rmse: 0.725904	valid_1's rmse: 0.751186
[1520]	training's rmse: 0.725788	valid_1's rmse: 0.751139
[1530]	training's rmse: 0.72564	valid_1's rmse: 0.751067
[1540]	training's rmse: 0.725525	valid_1's rmse: 0.751019
[1550]	training's rmse: 0.725406	valid_1's rmse: 0.750985
[1560]	training's rmse: 0.725295	valid_1's rmse: 0.750951
[1570]	training's rmse: 0.725182	valid_1's rmse: 0.750906
[1580]	training's rmse: 0.725085	valid_1's rmse: 0.750869
[1590]	training's rmse: 0.724982	valid_1's rmse: 0.75083
[1600]	training'

In [None]:
# Predictions are on the log1p scale the model was trained on.
predL = model.predict(x_test)

In [200]:
# Fill the sample submission with predictions mapped back from the log1p scale.
# NOTE(review): this assumes the rows of sample_submission.csv are in the same
# order as test.csv - confirm before submitting.
s = pd.read_csv('sample_submission.csv')
# Bracket assignment always writes a DataFrame column; attribute assignment
# (s.answer_score = ...) would only set a Python attribute if the column were
# missing from the sample file, and the predictions would never reach the CSV.
s['answer_score'] = np.expm1(predL)
s.to_csv('4.csv', index=False)