In [1]:
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import LabelEncoder
pd.options.display.float_format = '{:.8f}'.format
import os, sys
from fastai.imports import *
from fastai.structured import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error

In [2]:
# Load both tab-separated files with identical parser options
# (error_bad_lines=False: rows the parser cannot handle do not raise).
test, train = (
    pd.read_csv(csv_path, error_bad_lines=False, sep='\t')
    for csv_path in ('test.csv', 'train.csv')
)

In [3]:
# Regression target: the answer score column of the training frame.
y = train['answer_score']

In [4]:
# Remove the target so train carries feature columns only.
train = train.drop('answer_score', axis=1)

In [5]:
# Display (rows, columns) of both frames.
train.shape, test.shape

((852885, 8), (663082, 8))

In [6]:
# Convert the epoch-second timestamps into datetime columns on both frames.
# The '_transofrmed' suffix (sic) is kept because later cells refer to these exact names.
for frame in (train, test):
    for utc_col in ('question_utc', 'answer_utc'):
        frame[utc_col + '_transofrmed'] = pd.to_datetime(frame[utc_col], unit='s')

In [7]:
def add_dateparts(df, fldname, drop=True):
    """Expand a datetime column of `df` into calendar-part columns, in place.

    For each of Week, Day, Dayofweek and Dayofyear a column named
    ``<prefix><Part>`` is added, where ``<prefix>`` is `fldname` with a
    trailing "date"/"Date" removed.

    Args:
        df: DataFrame to modify in place.
        fldname: Name of the column to expand. If the column is not already a
            datetime column it is parsed with ``pd.to_datetime`` and the parsed
            values are stored back into `df`.
        drop: When true (the default), `fldname` is removed from `df` afterwards.

    Returns:
        None; `df` is mutated.
    """
    import re  # local import so the function does not depend on a star import providing `re`

    fld = df[fldname]
    # Recognises both naive and tz-aware datetime columns.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Week', 'Day', 'Dayofweek', 'Dayofyear'):
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Newer pandas exposes the ISO week through isocalendar() instead of `.dt.week`.
            # Cast back to the plain dtypes `.dt.week` used to produce
            # (int64, or float64 with NaN when values are missing).
            week = fld.dt.isocalendar().week
            part = week.astype('float64') if week.isna().any() else week.astype('int64')
        else:
            part = getattr(fld.dt, n.lower())
        df[targ_pre + n] = part
    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [8]:
# Expand both datetime columns into calendar parts (answer first, then question,
# which fixes the order of the resulting columns) on each frame.
for frame in (train, test):
    for dt_col in ('answer_utc_transofrmed', 'question_utc_transofrmed'):
        add_dateparts(frame, dt_col)

In [9]:
# Difference between the answer and question timestamps, for both frames.
for frame in (train, test):
    frame['answ_quest_dif'] = frame['answer_utc'] - frame['question_utc']

In [10]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer adding hand-crafted features from `answer_text` and `question_score`.

    Columns added by `transform`:
        log_len                   natural log of the answer length in characters
        question_mark_count       number of "?" characters in the answer
        question_mark_present     1 if the answer contains "?", else 0
        exclamation_mark_count    number of "!" characters in the answer
        exclamation_mark_present  1 if the answer contains "!", else 0
        log_question_score        log(question_score + 1)
        ratio_capitalized         upper-case characters / answer length

    Note: a zero-length answer gives log_len = -inf and ratio_capitalized = NaN (0 / 0).
    """

    def __init__(self):
        super(FeatureEngineer, self).__init__()

    def fit(self, data, *args, **kwargs):
        """No-op; nothing is learned from `data`. Returns `self`."""
        return self

    @staticmethod
    def _count_capitalized(text):
        """Return the number of upper-case characters in `text`."""
        return sum(1 for c in text if c.isupper())

    def transform(self, data, *args, **kwargs):
        """Return a new frame holding `data`'s columns plus the engineered features.

        Args:
            data: DataFrame with a string column `answer_text` and a numeric
                column `question_score`. Extra positional/keyword arguments
                are accepted and ignored.

        Returns:
            A DataFrame with the feature columns listed on the class appended.
            The new columns are attached to a shallow copy, so no columns are
            added to the caller's frame.
        """
        # Shallow copy: existing column data is shared, only the new columns are extra.
        out = data.copy(deep=False)

        answers = out["answer_text"]
        answer_len = answers.apply(len)  # computed once, reused for log_len and the ratio

        out['log_len'] = np.log(answer_len)

        out["question_mark_count"] = [s.count("?") for s in answers.values]
        out["question_mark_present"] = (out["question_mark_count"] > 0).astype(int)

        out["exclamation_mark_count"] = [s.count("!") for s in answers.values]
        out["exclamation_mark_present"] = (out["exclamation_mark_count"] > 0).astype(int)

        out["log_question_score"] = np.log(out["question_score"] + 1)

        out["ratio_capitalized"] = answers.apply(self._count_capitalized) / answer_len

        return out

In [11]:
# FeatureEngineer.fit is a no-op, so transform is called directly without fitting.
featureEngineer = FeatureEngineer()
train = featureEngineer.transform(train)

In [12]:
# Apply the same hand-crafted features to the test frame.
test = featureEngineer.transform(test)

In [13]:
def n_answers(df):
    """Return `df` with an `answers_count` column appended.

    `answers_count` is the number of rows in `df` that share the row's
    `question_id`. The input frame is not modified; the result comes from a
    left merge, so row order is kept and the index is renumbered.

    Args:
        df: DataFrame with a `question_id` column.

    Returns:
        A new DataFrame with all columns of `df` followed by `answers_count`.
    """
    # Make re-running idempotent: a stale column would otherwise come back
    # from the merge as answers_count_x / answers_count_y.
    if 'answers_count' in df.columns:
        df = df.drop('answers_count', axis=1)
    # Name the index and the values explicitly instead of renaming whatever
    # reset_index() happens to call them; those default names differ across pandas versions.
    counts = (df['question_id'].value_counts()
              .rename_axis('question_id')
              .reset_index(name='answers_count'))
    return df.merge(counts, on=['question_id'], how='left')

In [14]:
# Answer counts are computed within each frame separately (train and test are not pooled).
train = n_answers(train)
test = n_answers(test)

In [15]:
# Raw strings so that `\w`, `\d` and `\.` reach the regex engine without relying on
# Python passing unknown string escapes through.
# Matches the scheme and host part of an http(s) URL.
url_regex = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
# Matches an http(s) URL ending in an image extension.
# NOTE(review): this was previously written as '...[^)''"]...', which Python reads as two
# adjacent string literals, so the effective character class is [^)"] and single quotes are
# NOT excluded. The effective pattern is kept unchanged here so the derived features stay
# the same; confirm whether `'` was meant to be excluded as well.
img_regex = r'https?:[^)"]+\.(?:jpg|jpeg|gif|png)'

In [16]:
# Per-answer length, image count (capped at 6) and non-image link count (capped at 10).
for frame in (train, test):
    answers = frame["answer_text"]
    frame["text_length"] = answers.map(len)
    img_hits = answers.map(lambda txt: len(re.findall(img_regex, txt)))
    url_hits = answers.map(lambda txt: len(re.findall(url_regex, txt)))
    frame["answer_imgs"] = img_hits.clip(upper=6)
    # Links are counted net of the *uncapped* image count, then capped.
    frame["answer_links"] = (url_hits - img_hits).clip(upper=10)

In [17]:
# Display (rows, columns) of both frames; the column counts should match each other.
train.shape, test.shape

((852885, 28), (663082, 28))

In [18]:
# Pool answer and question text from both train and test into one Series.
concat_text = pd.concat([train.answer_text, test.answer_text, train.question_text, test.question_text])

In [19]:
# Word-level, unigram TF-IDF with sublinear tf scaling, accent stripping and English
# stop-word removal. token_pattern accepts runs of one or more word characters;
# max_features=None places no cap on the vocabulary size.
tv = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', 
                     analyzer='word', token_pattern=r'\w{1,}', 
                     ngram_range=(1, 1), stop_words='english', max_features=None)

In [20]:
# Fit the vectorizer on the pooled corpus (which includes the test-set text).
tv.fit(concat_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [21]:
# Question and answer text go through the same fitted vectorizer,
# once per text column and per frame.
train_text_question_features = tv.transform(train.question_text)
train_text_answer_features = tv.transform(train.answer_text)
test_text_question_features = tv.transform(test.question_text)
test_text_answer_features = tv.transform(test.answer_text)

In [22]:
# Binarizer configured for sparse output; fitted on the subreddit column further down.
lb = LabelBinarizer(sparse_output=True)

In [23]:
# Count vectorizer with default settings; fitted on question_id further down.
cv = CountVectorizer()

In [24]:
# Pool the ids / subreddit names of both frames so the encoders are fitted on values from train and test.
qid = pd.concat([train.question_id, test.question_id])
subr = pd.concat([train.subreddit, test.subreddit])

In [25]:
# Encode question_id with the count vectorizer.
# NOTE(review): tr_qid / te_qid are not referenced by any later cell shown here
# (the hstack cells leave them out) — confirm whether they were meant to be features.
cv.fit(qid)
tr_qid = cv.transform(train.question_id)
te_qid = cv.transform(test.question_id)

In [26]:
# Encode subreddit with the binarizer fitted on the pooled train+test values.
lb.fit(subr)
tr_subr = lb.transform(train.subreddit)
te_subr = lb.transform(test.subreddit)

In [27]:
# Columns that do not go into the dense part of the feature matrix (raw text and identifiers).
dropped_cols = ['question_text', 'answer_text', 'question_id', 'subreddit', 'id']
train_d, test_d = (frame.drop(dropped_cols, axis=1) for frame in (train, test))
train_d.shape, test_d.shape

((852885, 23), (663082, 23))

In [31]:
# Preview the first rows of the remaining training columns.
train_d.head()

Unnamed: 0,question_utc,question_score,answer_utc,answer_utc_transofrmedWeek,answer_utc_transofrmedDay,answer_utc_transofrmedDayofweek,answer_utc_transofrmedDayofyear,question_utc_transofrmedWeek,question_utc_transofrmedDay,question_utc_transofrmedDayofweek,...,question_mark_count,question_mark_present,exclamation_mark_count,exclamation_mark_present,log_question_score,ratio_capitalized,answers_count,text_length,answer_imgs,answer_links
0,1512086400,179,1512086616,48,1,4,335,48,1,4,...,0,0,0,0,5.19295685,0.03389831,9,59,0,0
1,1512086400,179,1512090694,48,1,4,335,48,1,4,...,0,0,0,0,5.19295685,0.02666667,9,75,0,0
2,1512086400,179,1512090763,48,1,4,335,48,1,4,...,1,1,0,0,5.19295685,0.05769231,9,52,0,0
3,1512086400,179,1512091349,48,1,4,335,48,1,4,...,0,0,0,0,5.19295685,0.02564103,9,78,0,0
4,1512086400,179,1512133012,48,1,4,335,48,1,4,...,0,0,0,0,5.19295685,0.0875,9,160,0,0


In [32]:
# Column order: question TF-IDF, answer TF-IDF, subreddit encoding, then the remaining frame columns.
x = hstack((train_text_question_features, train_text_answer_features, tr_subr, train_d))

In [33]:
# Same block order as the training matrix, built from the test-side pieces.
x_test = hstack((test_text_question_features, test_text_answer_features, te_subr, test_d))

In [256]:
# Hold out 15% of the rows for validation; the target is modelled in log1p space.
# NOTE(review): np.log1p(y) is NaN / -inf for any score <= -1 — confirm the range of answer_score.
train_X, valid_X, train_y, valid_y = train_test_split(x, np.log1p(y), test_size = 0.15, random_state = 42) 
d_train = lgb.Dataset(train_X, label=train_y)
d_valid = lgb.Dataset(valid_X, label=valid_y)
# Both sets are passed as valid_sets below.
watchlist = [d_train, d_valid]
params = {
        'learning_rate': 0.01,
        'application': 'regression',
        # NOTE(review): num_leaves (5046) is larger than 2**12 = 4096, the leaf count of a full
        # depth-12 binary tree — confirm this combination is intended.
        'max_depth': 12,
        'num_leaves': 5046,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 1,
        'nthread': 8,
        'max_bin': 819,
        
}
# Up to 2000 rounds, with early_stopping_rounds=50 and an evaluation line every 10 rounds.
# NOTE(review): the log recorded under this cell ends at round 50 with rmse around 164, which
# does not look like it was produced by the cell as written — re-run to confirm.
model = lgb.train(params, train_set=d_train, num_boost_round=2000, valid_sets=watchlist, \
    early_stopping_rounds=50, verbose_eval=10) 

Training until validation scores don't improve for 50 rounds.
[10]	training's rmse: 165.312	valid_1's rmse: 168.329
[20]	training's rmse: 156.107	valid_1's rmse: 164.696
[30]	training's rmse: 152.985	valid_1's rmse: 164.17
[40]	training's rmse: 151.133	valid_1's rmse: 164.145
[50]	training's rmse: 150.069	valid_1's rmse: 164.009
Did not meet early stopping. Best iteration is:
[50]	training's rmse: 150.069	valid_1's rmse: 164.009


In [None]:
# Predictions for the test matrix; the model was trained on a log1p target, so these are in log1p space.
# NOTE(review): whether predict() defaults to the early-stopped best iteration depends on the
# LightGBM version — confirm, or pass num_iteration=model.best_iteration explicitly.
predL = model.predict(x_test)

In [200]:
# Fill the sample submission with the predictions, mapped back from log1p space with expm1.
s = pd.read_csv('sample_submission.csv')
# Fail with a clear message if rows went missing upstream (e.g. skipped while reading test.csv).
if len(predL) != len(s):
    raise ValueError(
        'prediction count (%d) does not match sample_submission rows (%d)' % (len(predL), len(s))
    )
# Bracket assignment always writes a column; attribute-style assignment would only set a
# plain Python attribute if `answer_score` were not already a column of the file.
s['answer_score'] = np.expm1(predL)
s.to_csv('4.csv', index=False)