In [1]:
import numpy as np
import pandas as pd 
import matplotlib 
import matplotlib.pyplot as plt


df = pd.read_csv("train.csv")
print(df.head())

        id                                               text author
0  id26305  This process, however, afforded me no means of...    EAP
1  id17569  It never once occurred to me that the fumbling...    HPL
2  id11008  In his left hand was a gold snuff box, from wh...    EAP
3  id27763  How lovely is spring As we looked from Windsor...    MWS
4  id12958  Finding nothing else, not even gold, the Super...    HPL


Writting syles differ in many ways but one such way is that some writers are wordier than others, and some use more or less punctuation than others. This seems especially relevant given that the writers being analysed use several different types of writing (Poe as a poet, Shelly as short stories, Lovecraft as a novelist). There is definitely cross over with these styles but I would argue that the stylistic aspects carry over throughout works so the following functions are designed to assess the writing sections for wordiness and punctuator usage.

In [3]:
import string


def get_word_counts(text):
    count = 0
    for i in text:
        if i == " ":
            count += 1
    return count


def get_punctuator_counts(text):
    punct_dict = {}
    for i in string.punctuation:
        p_count = text.count(i)
        punct_dict[i] = p_count
    return punct_dict


def get_punctuator_array(text):
    punctuator_array_TF = []
    for i in string.punctuation:
        if i in text: 
            punctuator_array_TF.append(1)
        else:
            punctuator_array_TF.append(0)
    return punctuator_array_TF

df["word count"] = df.text.apply(get_word_counts)
df["punctuator_count"] = df.text.apply(get_punctuator_counts)
df["punctuator_array"] = df.text.apply(get_punctuator_array)
df["temp1"] = df.punctuator_count.apply(lambda x: sum(x.values()))
df["temp2"] = df.punctuator_array.apply(lambda x: sum(x))

print(df.head())
print(df.groupby("author", as_index=False)["word count"].mean())
print(df.groupby("author", as_index=False)["temp1"].mean())
print(df.groupby("author", as_index=False)["temp2"].mean())

        id                                               text author  \
0  id26305  This process, however, afforded me no means of...    EAP   
1  id17569  It never once occurred to me that the fumbling...    HPL   
2  id11008  In his left hand was a gold snuff box, from wh...    EAP   
3  id27763  How lovely is spring As we looked from Windsor...    MWS   
4  id12958  Finding nothing else, not even gold, the Super...    HPL   

   word count                                   punctuator_count  \
0          40  {'!': 0, '"': 0, '#': 0, '$': 0, '%': 0, '&': ...   
1          13  {'!': 0, '"': 0, '#': 0, '$': 0, '%': 0, '&': ...   
2          35  {'!': 0, '"': 0, '#': 0, '$': 0, '%': 0, '&': ...   
3          33  {'!': 0, '"': 0, '#': 0, '$': 0, '%': 0, '&': ...   
4          26  {'!': 0, '"': 0, '#': 0, '$': 0, '%': 0, '&': ...   

                                    punctuator_array  temp1  temp2  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...      7      3  
1  [0, 0, 0, 0, 0, 0

The data thus far does not look all that interesting but it does show that Poe tends to use slightly more punctuators and have fewer words (which makes sense for a poet), but also shows that HPL tends to use the least punctuation (somewhat surprising). It also reveals that wordcount/puntuation used might be a good metric to include. Additionally, another set of features that would be interesting to explore are metrics realated to readability. Here, this is looked at in brief with python's readability package. 

In [5]:
import readability


def get_readability_nums(textstr):
    pop_list = ['sentences_per_paragraph', 'type_token_ratio', 'sentences', 'paragraphs']
    res = dict(readability.getmeasures(text=textstr, merge=True))
    for i in pop_list:
        del res[i]
    return res

df["readability"] = df.text.apply(get_readability_nums)
keyslist = list(df.at[0, "readability"].keys())
for i in keyslist:
    df[i] = df.readability.apply(lambda x: x[i])
    print(df.groupby("author", as_index=False)[i].mean())

  author    Kincaid
0    EAP  11.345821
1    HPL  11.949075
2    MWS  11.641965
  author        ARI
0    EAP  12.290581
1    HPL  13.636678
2    MWS  13.181049
  author  Coleman-Liau
0    EAP      8.503033
1    HPL      9.137809
2    MWS      8.636868
  author  FleschReadingEase
0    EAP          59.146649
1    HPL          59.136985
2    MWS          60.534699
  author  GunningFogIndex
0    EAP        15.320904
1    HPL        15.412072
2    MWS        15.698662
  author        LIX
0    EAP  45.796967
1    HPL  47.179099
2    MWS  47.241839
  author  SMOGIndex
0    EAP  11.618737
1    HPL  11.331048
2    MWS  11.659334
  author       RIX
0    EAP  5.199747
1    HPL  5.483230
2    MWS  5.328756
  author  characters_per_word
0    EAP             4.451775
1    HPL             4.480366
2    MWS             4.431556
  author  syll_per_word
0    EAP       1.439720
1    HPL       1.410766
2    MWS       1.399661
  author  words_per_sentence
0    EAP           25.505443
1    HPL           27.

  author  syllables
0    EAP  37.118481
1    HPL  39.758296
2    MWS  38.444408
  author      words
0    EAP  25.505443
1    HPL  27.928305
2    MWS  27.476837
  author  wordtypes
0    EAP  21.744937
1    HPL  24.472760
2    MWS  23.404037
  author  long_words
0    EAP    5.199747
1    HPL    5.483230
2    MWS    5.328756
  author  complex_words
0    EAP       3.331139
1    HPL       3.012422
2    MWS       3.189940
  author  tobeverb
0    EAP  0.898734
1    HPL  0.772671
2    MWS  0.884844
  author   auxverb
0    EAP  0.306962
1    HPL  0.288731
2    MWS  0.446393
  author  conjunction
0    EAP     0.067975
1    HPL     0.064596
2    MWS     0.057743
  author   pronoun
0    EAP  0.310759
1    HPL  0.289796
2    MWS  0.386168
  author  preposition
0    EAP     0.155443
1    HPL     0.140728
2    MWS     0.109696


  author  nominalization
0    EAP        0.470506
1    HPL        0.259272
2    MWS        0.434646
  author  interrogative
0    EAP       0.023924
1    HPL       0.026264
2    MWS       0.031602
  author   article
0    EAP  0.138228
1    HPL  0.111269
2    MWS  0.104070
  author  subordination
0    EAP       0.027342
1    HPL       0.034073
2    MWS       0.036069


Features now need to be pipelined and labels seperated out. Important features will be pickedinitially using a decision tree to select for the features with the greatest information 
gain. Feature transformation will occur later on after there is a more solid grasp on the importance of individual features.

In [6]:
text_features = df.drop(columns=["author", "punctuator_count", "punctuator_array", 
                                 "readability", "id", "text"])
text_labels = df["author"].copy()
print(text_features.describe())

         word count         temp1         temp2       Kincaid           ARI  \
count  19579.000000  19579.000000  19579.000000  19579.000000  19579.000000   
mean      25.730477      3.759283      2.229481     11.610862     12.952885   
std       19.048353      3.009744      0.857384      8.095257      9.972436   
min        1.000000      1.000000      1.000000    -14.810000     -8.510000   
25%       14.000000      2.000000      2.000000      6.727143      6.942500   
50%       22.000000      3.000000      2.000000     10.680000     11.572174   
75%       33.000000      5.000000      3.000000     15.327500     17.182973   
max      860.000000     71.000000      7.000000    336.810453    429.797282   

       Coleman-Liau  FleschReadingEase  GunningFogIndex           LIX  \
count  19579.000000       19579.000000     19579.000000  19579.000000   
mean       8.727042          59.572356        15.463756     46.640785   
std        3.585138          28.608999         8.373508     21.600184

In [7]:
from sklearn.preprocessing import RobustScaler


scaler = RobustScaler()
for column in text_features.columns[2:]:
    shaped = np.array(text_features[column]).reshape(-1, 1)
    scaler.fit(shaped)
    text_features[column] = scaler.transform(shaped)
print(text_features.head())
test = text_features[['SMOGIndex', 'RIX', 'characters_per_word', 'syll_per_word']].copy()
print(test.describe())

   word count  temp1  temp2   Kincaid       ARI  Coleman-Liau  \
0          40      7    1.0  0.611773  0.843261      0.226461   
1          13      1   -1.0 -0.949628 -0.666546     -0.662584   
2          35      5    0.0  0.293011  0.579188      0.144228   
3          33      4    0.0  0.383910  0.723531      0.849288   
4          26      4    1.0  0.507386  0.548590      1.290869   

   FleschReadingEase  GunningFogIndex       LIX  SMOGIndex      ...        \
0          -0.186881         0.841602  0.909657   1.035276      ...         
1           1.221017        -0.642654 -0.434998  -0.414214      ...         
2           0.127191         0.474832  0.463298   0.585786      ...         
3          -0.140649         0.037191  0.820865  -0.414214      ...         
4          -0.727376         0.246917  0.682707   0.585786      ...         

   complex_words  tobeverb  auxverb  conjunction  pronoun  preposition  \
0       1.333333       0.0      1.0          0.0      1.0          0.0  

It is unclear at this point exactly how well a robust scaler will work on some of the 
features such as word types, word complexity, verb count etc. So this will be revisited
after training inital models.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn.metrics import make_scorer


model1 = DecisionTreeClassifier()
model1.fit(text_features, text_labels)
guess = model1.predict(text_features)

print(accuracy_score(text_labels, guess))

parameters = {"max_depth": (1, 10, 100, 1000, None)}
cv_sets = ShuffleSplit(n_splits= 10, test_size = 0.20, random_state = 0)
scorer = make_scorer(score_func=accuracy_score)
grid_obj = GridSearchCV(estimator=model1, scoring=scorer, param_grid=parameters, 
                        cv=cv_sets)
grid_obj.fit(text_features, text_labels)

print(grid_obj.best_score_)


0.998212370397


AttributeError: 'GridSearchCV' object has no attribute 'best_score_'