In [None]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.1-py3-none-any.whl (99 kB)
[?25l[K     |███▎                            | 10 kB 24.4 MB/s eta 0:00:01[K     |██████▋                         | 20 kB 27.5 MB/s eta 0:00:01[K     |█████████▉                      | 30 kB 12.8 MB/s eta 0:00:01[K     |█████████████▏                  | 40 kB 9.6 MB/s eta 0:00:01[K     |████████████████▍               | 51 kB 5.2 MB/s eta 0:00:01[K     |███████████████████▊            | 61 kB 5.2 MB/s eta 0:00:01[K     |███████████████████████         | 71 kB 5.8 MB/s eta 0:00:01[K     |██████████████████████████▎     | 81 kB 6.3 MB/s eta 0:00:01[K     |█████████████████████████████▌  | 92 kB 6.2 MB/s eta 0:00:01[K     |████████████████████████████████| 99 kB 4.0 MB/s 
[?25hCollecting pyphen
  Downloading pyphen-0.11.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 9.5 MB/s 
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.11.

In [None]:
import numpy as np
import pandas as pd
import textstat
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [None]:
# Load the four CSV files in one pass. Shapes noted when the notebook was written:
#   internal_train.csv -> 2434 rows, 6 cols
#   internal_test.csv  ->  400 rows, 6 cols
#   train.csv          -> 2834 rows, 6 cols
#   test.csv           ->    7 rows, 4 cols
internal_train_data, internal_test_data, full_train_data, blind_test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('internal_train.csv', 'internal_test.csv', 'train.csv', 'test.csv')
)

In [None]:
def compute_features(df):
    """Add textstat readability scores for each excerpt to ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'excerpt'`` column holding the text to score.

    Returns
    -------
    None
        ``df`` is modified in place; one column is added (or overwritten) per
        readability formula: ``flesch_re``, ``flesch_kg``, ``coleman``,
        ``auto_idx``, ``linsear``, ``gunning_fog`` and ``dale_chall``.
    """
    # Output column -> textstat function that produces it.
    scorers = {
        'flesch_re': textstat.flesch_reading_ease,
        'flesch_kg': textstat.flesch_kincaid_grade,
        'coleman': textstat.coleman_liau_index,
        'auto_idx': textstat.automated_readability_index,
        'linsear': textstat.linsear_write_formula,
        'gunning_fog': textstat.gunning_fog,
        'dale_chall': textstat.dale_chall_readability_score,
    }

    # Map over the text column directly instead of applying row-wise over the
    # whole frame: each scorer only needs the excerpt, and Series.map always
    # yields a Series, so an empty frame simply gains empty columns.
    excerpts = df['excerpt']
    for column, scorer in scorers.items():
        df[column] = excerpts.map(scorer)

# Add the readability columns to every dataset (each frame is modified in place).
for _frame in (internal_train_data, internal_test_data, full_train_data, blind_test_data):
    compute_features(_frame)

In [None]:
# Readability scores added by compute_features; these are the model inputs.
features = ['flesch_re', 'flesch_kg', 'coleman', 'auto_idx', 'linsear', 'gunning_fog', 'dale_chall']

full_train_X = full_train_data[features]
internal_train_X = internal_train_data[features]
internal_test_X = internal_test_data[features]
blind_test_X = blind_test_data[features]

# Standardise the features. Each evaluation setting gets a scaler fitted only on
# the data its model is trained on, so scaling statistics never come from rows
# the model is later evaluated on.

# Internal evaluation: fit on the internal training split only.
# NOTE(review): the row counts (2434 + 400 = 2834) suggest internal_test.csv is
# a hold-out carved from train.csv; a scaler fitted on the full training set
# would then leak hold-out statistics into the internal scores. Confirm.
internal_scaler = StandardScaler().fit(internal_train_X)
internal_train_X = internal_scaler.transform(internal_train_X)
internal_test_X = internal_scaler.transform(internal_test_X)

# Final model: fit on the full training set and reuse it for the blind test set.
scaler = StandardScaler().fit(full_train_X)
full_train_X = scaler.transform(full_train_X)
blind_test_X = scaler.transform(blind_test_X)

In [None]:
# Candidate regressors, keyed by the label printed in the report below.
models = dict(
    linear_reg=LinearRegression(),
    svr_rbf=SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1),
    ridge=Ridge(alpha=1.0),
)

# 5-fold cross-validation on the internal training split. The scorer returns
# negated RMSE, so values closer to zero are better.
for name in models:
    model = models[name]
    print(f'\n{name}')
    scores = cross_val_score(
        model,
        internal_train_X,
        internal_train_data['target'],
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    print(scores)
    print('avg:', scores.mean())


linear_reg
[-0.84836944 -0.83090355 -0.80754878 -0.79271503 -0.86242078]
avg: -0.8283915168503325

svr_rbf
[-0.84782461 -0.82681697 -0.79446664 -0.78550343 -0.87172306]
avg: -0.825266943011688

ridge
[-0.84869737 -0.83054144 -0.80727834 -0.79269704 -0.86201346]
avg: -0.8282455322047673


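The RBF-kernel SVR has the lowest average cross-validated RMSE (about 0.825, versus about 0.828 for linear regression and ridge), so we use it as our final model. To estimate how it performs on unseen data, we evaluate it on the internal test set.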

In [None]:
# Refit the selected estimator on the internal training split and report its
# RMSE on the internal hold-out set.
model = models['svr_rbf']
model.fit(internal_train_X, internal_train_data['target'])
y_pred = model.predict(internal_test_X)

# Take the square root explicitly: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) works on every version.
print(np.sqrt(mean_squared_error(internal_test_data['target'], y_pred)))

0.8008069258529285


In [None]:
# Refit the chosen model on the complete training set, then score the blind
# test set; the predictions are stored on blind_test_data as a 'target' column.
blind_test_data['target'] = model.fit(full_train_X, full_train_data['target']).predict(blind_test_X)

# The submission file holds only the excerpt id and its predicted target.
submission = blind_test_data.loc[:, ['id', 'target']]
submission.to_csv('submission.csv', index=False)
print(submission)

          id    target
0  c0f722661 -0.604432
1  f0953f0a5 -0.245682
2  0df072751 -0.894516
3  04caf4e0c -1.809409
4  0e63f8bea -2.175466
5  12537fe78 -0.679070
6  965e592c0  0.054704
