In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rishubh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/rishubh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rishubh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
np.random.seed(42)
grid = {
    "n_estimators":np.arange(10,100,10),
    "max_depth":[None,3,5,10],
    "min_samples_split":np.arange(2,20,2),
    "min_samples_leaf":np.arange(1,20,2),
    "max_features": [0.5,1,"sqrt","auto"],
    "max_samples":[10000,4000,15000,4000]
}

In [35]:
X_train = pd.read_csv('./data/train_texts.csv', header=0, sep=',', quotechar='"')
Y_train = pd.read_csv('./data/train_labels.csv', header=0, sep=',', quotechar='"')
X_test = pd.read_csv('./data/test_texts.csv', header=0, sep=',', quotechar='"')
Y_test = pd.read_csv('./data/test_labels.csv', header=0, sep=',', quotechar='"')
X_train = X_train.head(6000)
Y_train = Y_train.head(6000)
X_test = X_test.head(2000)
Y_test = Y_test.head(2000)

X_train.head()

Unnamed: 0,train_texts
0,german & dutch customer services administrator...
1,"account director berkshire permanent pr, adve..."
2,desktop support analyst belfast contract hays ...
3,lead ccie consultant gold partner finance the ...
4,rgn / rmn nurse uk permanent cvbrowser social ...


In [36]:
def clean(text):
    wn = nltk.WordNetLemmatizer()
    stopword = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    lower = [word.lower() for word in tokens]
    no_stopwords = [word for word in lower if word not in stopword]
    no_alpha = [word for word in no_stopwords if word.isalpha()]
    lemm_text = [wn.lemmatize(word) for word in no_alpha]
    clean_text = lemm_text
    counter = Counter(clean_text)
    most_occur = counter.most_common(40)
    most_occuring_words = [a_tuple[0] for a_tuple in most_occur]
    clean_text = [i for i in clean_text if i in most_occuring_words]
    return clean_text

In [37]:
def vectorize(data,tfidf_vect_fit):
    X_tfidf = tfidf_vect_fit.transform(data)
    words = tfidf_vect_fit.get_feature_names()
    X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
    X_tfidf_df.columns = words
    return(X_tfidf_df)

In [38]:
tfidf_vect = TfidfVectorizer(analyzer=clean)
tfidf_vect_fit=tfidf_vect.fit(X_train['train_texts'])
X_train=vectorize(X_train['train_texts'],tfidf_vect_fit)
X_train.head()

#tfidf_vect = TfidfVectorizer(analyzer=clean)
#tfidf_vect_fit=tfidf_vect.fit(X_test['test_texts'])
X_test=vectorize(X_test['test_texts'],tfidf_vect_fit)
X_test.head()

Unnamed: 0,aa,aar,aarca,aaron,aasl,aat,ab,abacus,abae,abaes,...,zend,zenith,zero,zest,zigbee,zoe,zone,zorba,zouch,ƒx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
model = RandomizedSearchCV(
RandomForestRegressor(n_jobs=-1,
                     random_state=42),
                    param_distributions = grid,
                     n_iter=5,
                    cv=5,
                    verbose=True)

In [40]:
model.fit(X_train,Y_train.values.ravel())

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5,
                   estimator=RandomForestRegressor(n_jobs=-1, random_state=42),
                   n_iter=5,
                   param_distributions={'max_depth': [None, 3, 5, 10],
                                        'max_features': [0.5, 1, 'sqrt',
                                                         'auto'],
                                        'max_samples': [10000, 4000, 15000,
                                                        4000],
                                        'min_samples_leaf': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]),
                                        'n_estimators': array([10, 20, 30, 40, 50, 60, 70, 80, 90])},
                   verbose=True)

In [41]:
y_preds = model.predict(X_test)

In [42]:
mae_hyp = mean_absolute_error(Y_test,y_preds)

In [43]:
mae_hyp

9332.271056149542