In [1]:
import pandas as pd

In [2]:
horror_data = pd.read_csv('Data/HorrorAuthor/train.csv')

In [3]:
horror_data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [28]:
horror_data.author.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

#### Remove everything apart from alphanumeric or space, convert to lowercase

In [30]:
import re
horror_data['cleaned'] = horror_data['text'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))

In [32]:
horror_data.head()

Unnamed: 0,id,text,author,cleaned
0,id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...
1,id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...
3,id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...


In [33]:
horror_data['num_words'] = horror_data['cleaned'].str.len()

In [34]:
horror_data.head()

Unnamed: 0,id,text,author,cleaned,num_words
0,id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224
1,id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195
3,id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...,202
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...,170


In [37]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

text = Pipeline([
                ('selector', TextSelector(key='cleaned')),
                ('tfidf', TfidfVectorizer( stop_words='english'))
            ])

num = Pipeline([
                ('selector', NumberSelector(key='num_words')),
                ('ss', StandardScaler())
            ])

text.fit_transform(horror_data)

<19579x25077 sparse matrix of type '<class 'numpy.float64'>'
	with 220642 stored elements in Compressed Sparse Row format>

In [52]:
from sklearn.pipeline import FeatureUnion

In [53]:
fu = FeatureUnion([
    ('text',text),
    ('num',num)
])

In [54]:
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
    ('fu',fu),
    ('clsf',LogisticRegression())
])

In [55]:
fu.fit_transform(horror_data)

<19579x25078 sparse matrix of type '<class 'numpy.float64'>'
	with 240221 stored elements in Compressed Sparse Row format>

In [56]:
pipe.fit(horror_data,horror_data.author)

Pipeline(memory=None,
     steps=[('fu', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', TextSelector(key='cleaned')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [57]:
pipe.predict(horror_data.iloc[:2])

array(['EAP', 'EAP'], dtype=object)

In [58]:
pipe.score(horror_data,horror_data.author)

0.9066857347157669

In [60]:
horror_data_test = pd.read_csv('Data/HorrorAuthor/test.csv')

In [62]:
horror_data_test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...
