# Text classification using TF-IDF

### 1. Load the dataset from sklearn.datasets

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# The four newsgroups used as class labels in this notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

### 2. Training data

In [3]:
# Training split, restricted to the selected categories; the fixed seed keeps
# the shuffled order reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

### 3. Test data

In [4]:
# Test split, restricted to the same categories and shuffled with the same seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

### a. You can access the values of the target variable using the `.target` attribute
### b. You can access the class names of the target variable using the `.target_names` attribute


In [5]:
# Integer class label of each training document.
twenty_train.target

array([1, 1, 3, ..., 2, 2, 2])

In [6]:
# Class names for the target variable.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [7]:
# Peek at the first two raw training documents (headers and body included).
twenty_train.data[:2]

['From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n',
 "From: ani@ms.uky.edu (Aniruddha B. Deglurkar)\nSubject: help: Splitting a trimming region along a mesh \nOrganization: University Of Kentucky, Dept. of Math Sciences\nLines: 28\n\n\n\n\tHi,\n\n\tI have a problem, I hope some of the 'gurus' can help me solve.\n\n\tBackground of the probl

In [8]:
import re
import string

def processData(items):
  """Normalize raw documents into lowercase, single-space-separated text.

  For each document this:
    * deletes the characters " ' : ? ( ) . - , outright, so the text on
      either side is joined (e.g. "city.ac.uk" becomes "cityacuk");
    * lowercases the text;
    * turns every remaining character other than 0-9, a-z, '#' and '+'
      (including newlines, tabs and underscores) into a space;
    * collapses runs of spaces into one and strips both ends.

  Args:
    items: iterable of str documents.

  Returns:
    A new list with one cleaned string per input document, in input order.
  """
  # Characters that are removed without leaving a gap behind.
  deleted = str.maketrans('', '', '"\':?().-,')
  cleaned = []
  for item in items:
    text = item.translate(deleted).lower()
    # Underscores become spaces here, *before* whitespace is collapsed, so an
    # isolated "_" can no longer leave a run of spaces in the result.
    text = re.sub('[^0-9a-z#+]', ' ', text)
    text = re.sub(' +', ' ', text).strip()
    cleaned.append(text)
  return cleaned

In [9]:
# Clean every raw training document with processData.
X_train = processData(twenty_train.data)

In [10]:
# Inspect the first two cleaned training documents.
X_train[:2]

['from sd345 cityacuk michael collier subject converting images to hp laserjet iii nntppostinghost hampton organization the city university lines 14 does anyone know of a good way standard pc application pd utility to convert tif img tga files into laserjet iii format we would also like to do the same converting to hpgl hp plotter files please email any response is this the correct group thanks in advance michael michael collier programmer the computer unit email mpcollier ukaccity the city university tel 071 4778000 x3769 london fax 071 4778565 ec1v 0hb',
 'from ani msukyedu aniruddha b deglurkar subject help splitting a trimming region along a mesh organization university of kentucky dept of math sciences lines 28 hi i have a problem i hope some of the gurus can help me solve background of the problem i have a rectangular mesh in the uv domain ie the mesh is a mapping of a 3d bezier patch into 2d the area in this domain which is inside a trimming loop had to be rendered the trimming 

In [11]:
import pandas as pd

# One row per training document: the cleaned text and its class label.
df = pd.DataFrame({'text': X_train, 'label': twenty_train.target})

# Show the first rows as a quick sanity check.
df.head()

Unnamed: 0,text,label
0,from sd345 cityacuk michael collier subject co...,1
1,from ani msukyedu aniruddha b deglurkar subjec...,1
2,from djohnson csucsdedu darin johnson subject ...,3
3,from s0612596 letrugnl mm zwart subject cathol...,3
4,from stanly grok11columbiascncrcom stanly subj...,3


### Processing Test Data

In [12]:
# Integer class label of each test document.
twenty_test.target

array([2, 2, 2, ..., 2, 2, 1])

In [13]:
# Peek at the first two raw test documents.
twenty_test.data[:2]

["From: brian@ucsd.edu (Brian Kantor)\nSubject: Re: HELP for Kidney Stones ..............\nOrganization: The Avant-Garde of the Now, Ltd.\nLines: 12\nNNTP-Posting-Host: ucsd.edu\n\nAs I recall from my bout with kidney stones, there isn't any\nmedication that can do anything about them except relieve the pain.\n\nEither they pass, or they have to be broken up with sound, or they have\nto be extracted surgically.\n\nWhen I was in, the X-ray tech happened to mention that she'd had kidney\nstones and children, and the childbirth hurt less.\n\nDemerol worked, although I nearly got arrested on my way home when I barfed\nall over the police car parked just outside the ER.\n\t- Brian\n",
 'From: rind@enterprise.bih.harvard.edu (David Rind)\nSubject: Re: Candida(yeast) Bloom, Fact or Fiction\nOrganization: Beth Israel Hospital, Harvard Medical School, Boston Mass., USA\nLines: 37\nNNTP-Posting-Host: enterprise.bih.harvard.edu\n\nIn article <1993Apr26.103242.1@vms.ocom.okstate.edu>\n banschbach@

In [14]:
# Apply the same cleaning to the test documents as was applied to the training documents.
X_test = processData(twenty_test.data)

In [15]:
# One row per test document: the cleaned text and its class label.
testdf = pd.DataFrame({'text': X_test, 'label': twenty_test.target})

# Show the first rows as a quick sanity check.
testdf.head()

Unnamed: 0,text,label
0,from brian ucsdedu brian kantor subject re hel...,2
1,from rind enterprisebihharvardedu david rind s...,2
2,from adwright iastateedu subject re centi and ...,2
3,from livesey solntzewpdsgicom jon livesey subj...,0
4,from jhpb sartobuddlakenjus joseph h buehler s...,3


### 4. Now that the dependent and independent data are available for both the train and test datasets, use TfidfVectorizer to fit and transform the training data, then transform the test data with the fitted vectorizer, to get the TF-IDF features for both

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# TF-IDF over 1- to 3-grams, dropping English stop words and any term that
# appears in fewer than 3 training documents.
vectorizer = TfidfVectorizer(
    min_df=3,
    stop_words='english',
    ngram_range=(1, 3),
)
# Learn the vocabulary and IDF weights from the training text only.
train = vectorizer.fit_transform(df.text)

In [18]:
# Reuse the vectorizer fitted on the training text; it is not refitted on the test text.
test = vectorizer.transform(testdf.text)

### 5. Use LogisticRegression with the TF-IDF features as input and the targets as output, train the model, and report the train and test accuracy scores

In [19]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed; all other hyperparameters are left at their defaults.
clf = LogisticRegression(random_state=0)

In [20]:
# Train on the TF-IDF features of the training documents and their labels.
clf.fit(train, df.label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# Score on the held-out test set, scaled to a percentage.
print(clf.score(test, testdf.label) * 100)

91.54460719041279


In [22]:
# Predicted class labels for the test documents (variable name kept as-is; a later cell reads it).
predection = clf.predict(test)

In [23]:
from sklearn.metrics import accuracy_score

# Percentage of test documents the logistic regression labelled correctly.
acc = accuracy_score(testdf.label, predection) * 100
# round() already yields the number to print; the former float(str(...))
# round-trip produced the same value and has been dropped.
print("Accuracy using LogisticRegression {}".format(round(acc, 2)))

Accuracy using LogisticRegression 91.54


In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Random forest baseline: 400 trees, depth capped at 400, fixed seed.
rfclf = RandomForestClassifier(
    n_estimators=400,
    max_depth=400,
    random_state=0,
)
# Train on the same TF-IDF features and labels as the logistic regression.
rfclf.fit(train, df.label)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=400, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [26]:
# Predicted class labels for the test documents from the random forest.
rfpredict = rfclf.predict(test)

In [27]:
# Percentage of test documents the random forest labelled correctly.
acc = accuracy_score(testdf.label, rfpredict) * 100
# round() already yields the number to print; the former float(str(...))
# round-trip produced the same value and has been dropped.
print("Accuracy using RandomForest {}".format(round(acc, 2)))

Accuracy using RandomForest 81.23
