In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the kernel that is running this notebook.
%pip install textstat

In [None]:
import numpy as np
import pandas as pd
import textstat
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [None]:
# Read the training and test sets from the working directory.
# Sizes noted by the original author: train 2834 rows x 6 cols, test 7 rows x 4 cols.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

In [None]:
def compute_features(df):
    """Add textstat readability scores of each excerpt as new columns of ``df``.

    Seven columns are written: ``flesch_re``, ``flesch_kg``, ``coleman``,
    ``auto_idx``, ``linsear``, ``gunning_fog`` and ``dale_chall``. Each holds
    the result of the matching ``textstat`` formula applied to the value in
    the ``excerpt`` column of the same row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column (assumed to hold the raw text of each
        passage). It is modified in place; columns with the names listed
        above are overwritten if they already exist.

    Returns
    -------
    None
        The scores are stored on ``df`` itself.

    Raises
    ------
    KeyError
        If ``df`` has no ``excerpt`` column.
    """
    # Output column -> textstat formula. Built inside the function so that
    # `textstat` is only looked up when the function is actually called.
    scorers = {
        'flesch_re': textstat.flesch_reading_ease,
        'flesch_kg': textstat.flesch_kincaid_grade,
        'coleman': textstat.coleman_liau_index,
        'auto_idx': textstat.automated_readability_index,
        'linsear': textstat.linsear_write_formula,
        'gunning_fog': textstat.gunning_fog,
        'dale_chall': textstat.dale_chall_readability_score,
    }

    excerpts = df['excerpt']
    for column, scorer in scorers.items():
        # Series.map visits only the text column and always yields a Series
        # (also for an empty frame), unlike row-wise DataFrame.apply(axis=1).
        df[column] = excerpts.map(scorer)

# Add the readability columns to both frames; compute_features writes them
# onto the frame it is given.
for frame in (train_data, test_data):
    compute_features(frame)

In [None]:
# Readability-score columns used as model inputs.
features = [
    'flesch_re',
    'flesch_kg',
    'coleman',
    'auto_idx',
    'linsear',
    'gunning_fog',
    'dale_chall',
]

# Regression target taken from the training frame.
train_y = train_data['target']

# Standardise the features: the scaler is fitted on the training rows only
# and the same fitted scaler is then applied to the test rows.
scaler = StandardScaler()
train_X = scaler.fit_transform(train_data[features])
test_X = scaler.transform(test_data[features])

In [None]:
# Candidate regressors, keyed by the label printed above their scores.
models = dict(
    linear_reg=LinearRegression(),
    svr_rbf=SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1),
    ridge=Ridge(alpha=1.0),
)

# Score every candidate with 5-fold cross-validation. The scorer reports
# RMSE negated, so values closer to zero are better.
for name, model in models.items():
    print(f'\n{name}')
    scores = cross_val_score(
        model,
        train_X,
        train_y,
        cv=5,
        scoring='neg_root_mean_squared_error',
    )
    print(scores)
    print('avg:', scores.mean())


linear_reg
[-0.83549822 -0.94292949 -0.87820843 -0.87407251 -0.77085161]
avg: -0.8603120528063446

svr_rbf
[-0.84336044 -0.94113135 -0.85097222 -0.82722789 -0.79444122]
avg: -0.8514266239738664

ridge
[-0.83505734 -0.94155316 -0.87804504 -0.87458208 -0.77052283]
avg: -0.8599520891619378


In [None]:
# Fit the RBF support-vector regressor on the whole training set
# (fit returns the estimator itself, so the result can be bound directly).
model = models['svr_rbf'].fit(train_X, train_y)

# Predict the blind test set, store the predictions on the test frame and
# write the id/target pairs to the submission file.
test_data['target'] = model.predict(test_X)
submission = test_data.loc[:, ['id', 'target']]
submission.to_csv('submission.csv', index=False)
print(submission)

          id    target
0  c0f722661 -0.604432
1  f0953f0a5 -0.245682
2  0df072751 -0.894516
3  04caf4e0c -1.809409
4  0e63f8bea -2.175466
5  12537fe78 -0.679070
6  965e592c0  0.054704
