In [35]:
import pandas as pd
import spacy
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder

# Data

In [14]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

train = train.dropna()

In [24]:
features = train.columns.drop(['id', 'category'])
target = 'category'

In [25]:
X_train = train[features]
X_test = test[features]

y_train = train[target]

In [29]:
encoder = OrdinalEncoder()

encoder.fit(X_train['author'].values.reshape(-1, 1))

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [30]:
X_train['author'] = encoder.transform(X_train['author'].values.reshape(-1, 1))
X_test['author'] = encoder.transform(X_test['author'].values.reshape(-1, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [38]:
# description

nlp = spacy.load('en_core_web_md')

def tokenize(doc):
    d = nlp(doc)
    tokens = []
    
    # You can extract anything you want from Spacy to use as a feature
    # Noun Chunks aren't the end all be all.
    # Try Extracting just tokens, or just adjectives, or just pronouns
    for chunk in d.noun_chunks:
        tokens.append(chunk.lemma_)
        
    return tokens

vect = CountVectorizer(analyzer=tokenize, max_df=7, min_df=4)

In [40]:
vect.fit(X_train['description'])

CountVectorizer(analyzer=<function tokenize at 0x1a1a06ce18>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=7,
        max_features=None, min_df=4, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [41]:
X = vect.transform(X_train['description'])
X = X.todense()
X

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [15]:
# Initialize RFC and Vectorizer
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

In [16]:
# Define pipeline
pipe = Pipeline([('vect', vect), ('rfc', rfc)])

In [17]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (100, 500, 1000),
    'rfc__n_estimators': (20, 100, 400)
}

In [18]:
gs = GridSearchCV(pipe, parameters, cv=5, n_jobs=4)

In [19]:
gs.fit(train['description'], train['category'])

KeyboardInterrupt: 

In [None]:
gs.best_estimator_

In [None]:
# Reinitialize RFC and Vectorizer
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

# Redfine pipeline (not sure if necessary)
pipe = Pipeline([('vect', vect), ('rfc', rfc)])

In [None]:
# Fit pipeline
pipe.fit(train['description'], train['category'])

# Making a Submission

In [28]:
# Test Pipeline
y_pred = pipe.predict(test['description'])

In [29]:
submission = pd.DataFrame({'id': test['id'], 'category':y_pred})
submission['category'] = submission['category'].astype('int64')

In [30]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [31]:
submission.to_csv('submission2.csv', index=False)

In [32]:
df = pd.read_csv('submission2.csv')
df.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1
