<a href="https://colab.research.google.com/github/gauravthombare/gauravthombare/blob/main/Data_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load the training portion of the 20 Newsgroups corpus, with document order shuffled.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
)

In [3]:
# Display the category names exposed by the loaded training set.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Inspect the first raw training document; the text still carries its
# message headers (From/Subject/Organization/...), which the vectorizer will see too.
twenty_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Token-count vectorizer created with no arguments, i.e. the library defaults.
count_vect = CountVectorizer()

In [7]:
# Learn the vocabulary from the training documents and encode them as a
# count matrix in one call.
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Show the matrix dimensions (one row per training document).
X_train_counts.shape

(11314, 130107)

In [8]:
# Show the matrix repr to confirm it is held in a sparse format rather than as a dense array.
X_train_counts

<11314x130107 sparse matrix of type '<class 'numpy.int64'>'
	with 1787565 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF re-weighting of the raw counts, using the transformer's default settings.
tfidf_transformer = TfidfTransformer()
# Fit the weighting on the training counts and apply it in the same call.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# The shape is expected to match X_train_counts: re-weighting does not add or drop columns.
X_train_tfidf.shape

(11314, 130107)

In [10]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model on the TF-IDF features and their labels.
clf = MultinomialNB().fit(
    X=X_train_tfidf,
    y=twenty_train.target,
)

# Developing a pipeline

In [11]:
from sklearn.pipeline import Pipeline
# Bundle the three manual steps above (counts -> TF-IDF -> Naive Bayes) into one
# estimator so raw text can be passed straight to fit/predict.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [12]:
# Train the whole Naive Bayes pipeline on the raw training documents.
# Re-binding the returned estimator keeps the cell from echoing its repr.
text_clf = text_clf.fit(
    X=twenty_train.data,
    y=twenty_train.target,
)

## Testing NB classifier performance

In [13]:
import numpy as np
# Load the held-out test portion of the corpus.
twenty_test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
)
# Label every test document with the fitted Naive Bayes pipeline.
predicted = text_clf.predict(X=twenty_test.data)

In [15]:
# Test-set accuracy: fraction of documents whose predicted label equals the true label.
(predicted == twenty_test.target).mean()

0.7738980350504514

## Classifier using a linear SVM (SGDClassifier with hinge loss)

In [16]:
from sklearn.linear_model import SGDClassifier

In [35]:
# Pipeline: token counts -> TF-IDF -> linear SVM (hinge loss, L2 penalty) trained with SGD.
# random_state is pinned because SGD training is stochastic; without a seed the
# fitted model, the test accuracy and the grid-search scores change on every run.
text_clf_svm = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf_svm', SGDClassifier(loss='hinge', penalty='l2', random_state=42)),])

In [20]:
# Train the SVM pipeline on the raw training documents.
# The result is re-bound to the pipeline's own name (fit returns the estimator)
# instead of `_`, so the cell stays silent without overwriting the name IPython
# uses for the previous cell output.
text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)

In [21]:
# Label every test document with the fitted SVM pipeline.
predicted_svm = text_clf_svm.predict(
    X=twenty_test.data,
)

In [22]:
# Test-set accuracy of the SVM pipeline: share of predictions that match the true labels.
(predicted_svm == twenty_test.target).mean()

0.8240839086563994

## Grid Search

### Naive Bayes

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# Search space for the Naive Bayes pipeline. Keys follow the
# '<pipeline step>__<parameter>' convention, so each entry targets one step of text_clf.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [25]:
# Echo the grid to double-check the keys and candidate values before searching.
parameters

{'vect__ngram_range': [(1, 1), (1, 2)],
 'tfidf__use_idf': (True, False),
 'clf__alpha': (0.01, 0.001)}

In [26]:
# Exhaustive search over `parameters` for the Naive Bayes pipeline;
# n_jobs=-1 asks for all available CPU cores.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
)

In [27]:
# Run the search on the training data only; the test split is not touched here.
gs_clf = gs_clf.fit(
    X=twenty_train.data,
    y=twenty_train.target,
)

In [29]:
# Best mean cross-validated score found by the search. It is computed on folds of
# the training data, so it is not directly comparable to the held-out test accuracy
# reported for text_clf earlier in the notebook.
gs_clf.best_score_

0.9157684864695698

In [30]:
# Parameter combination that produced the best cross-validated score above.
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

### SVM 

In [37]:
# Search space for the SVM pipeline. Same vectorizer/TF-IDF options as the
# Naive Bayes grid; the 'clf_svm__' prefix targets the SGDClassifier step of text_clf_svm.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf_svm__alpha': (1e-2, 1e-3),
}

In [38]:
# Exhaustive search over `parameters_svm` for the SVM pipeline, using all CPU cores.
gs_clf_svm = GridSearchCV(
    estimator=text_clf_svm,
    param_grid=parameters_svm,
    n_jobs=-1,
)
# Run the search on the training data only.
gs_clf_svm = gs_clf_svm.fit(
    X=twenty_train.data,
    y=twenty_train.target,
)

In [39]:
# Best mean cross-validated score for the SVM pipeline. It is measured on folds of
# the training data, so it is not directly comparable to the held-out test accuracy
# reported for text_clf_svm earlier in the notebook.
gs_clf_svm.best_score_

0.9049852448941239

In [40]:
# Parameter combination that produced the best cross-validated SVM score above.
gs_clf_svm.best_params_

{'clf_svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}