-
Notifications
You must be signed in to change notification settings - Fork 0
/
optimal.py
54 lines (37 loc) · 1.46 KB
/
optimal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from nlp import dataf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Baseline: normalized class frequencies of the target — this is the
# accuracy floor a majority-class guesser would achieve.
print(dataf['subreddit'].value_counts(normalize = True))

# Predictor (x) and target (y): classify a post's subreddit from its title.
x = dataf['title']
y = dataf['subreddit']

# Stratify on y so train and test keep the same subreddit proportions;
# fixed random_state makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    random_state=42,
    stratify=y,
)
"""
Finding best parameters for Multinomial Naive Bayes Classifier
"""
#GridSearchCV
pipe = Pipeline([('vect', CountVectorizer()), ('nb', MultinomialNB())])
params = {'vect__ngram_range':[(1,1),(1,3)], 'nb__alpha':[0.36, 0.6]}
gs = GridSearchCV(pipe, param_grid = params, cv = 3)
gs.fit(xtrain,ytrain)
print("best score: ",gs.best_score_)
print("train score: ", gs.score(xtrain,ytrain))
print("test score: ",gs.score(xtest,ytest))
print(gs.best_params_)
"""
Find best parameters for Logistic Regression
"""
#GridSearchCV
pipe = Pipeline([('vect', CountVectorizer()), ('lr', LogisticRegression(solver = 'liblinear'))])
params = {'vect__ngram_range':[(1,1),(1,3)], 'lr__C':[0.01, 1]}
gs = GridSearchCV(pipe, param_grid = params, cv = 3)
gs.fit(xtrain,ytrain)
print("best score: ",gs.best_score_)
print("train score: ", gs.score(xtrain,ytrain))
print("test score: ",gs.score(xtest,ytest))
print(gs.best_params_)