# Text Classification

## Importing required libraries

In [1]:
import pandas as pd

## Loading the Data
source: https://www.kaggle.com/ashishpatel26/sentimental-analysis-nlp

In [2]:
# The file is tab-separated and has no header row, so the two column
# names are supplied explicitly.
df = pd.read_csv(
    'Datasets/sentiment_data.csv',
    sep='\t',
    header=None,
    names=['Label', 'Text'],
)

## Data Cleaning and Preparation

### Check the dimensions 

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(6918, 2)

### Check column names

In [4]:
# Column labels of the frame (the names passed to read_csv at load time).
df.columns

Index(['Label', 'Text'], dtype='object')

In [5]:
# Preview ten random rows. The fixed seed makes the preview identical on
# every Restart & Run All instead of changing with each execution.
df.sample(10, random_state=42)

Unnamed: 0,Label,Text
3254,1,I love Brokeback Mountain....
1742,1,Mission Impossible 3 was excellent.
5290,0,"Always knows what I want, not guy crazy, hates..."
3211,1,"Anyway, thats why I love "" Brokeback Mountain."
5222,0,I hate Harry Potter.
6496,0,Ok brokeback mountain is such a horrible movie.
308,1,The Da Vinci Code is awesome!!
2490,1,I am going to start reading the Harry Potter s...
5747,0,This quiz sucks and Harry Potter sucks ok bye..
5420,0,"Always knows what I want, not guy crazy, hates..."


### Check for Nulls

In [6]:
# True if at least one cell anywhere in the frame is missing.
df.isnull().values.any()

False

## Machine Learning

In [7]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

### Define Output and Inputs

In [8]:
# Inputs are the raw sentences; the target is the label column.
X = df['Text']
y = df['Label']

### Split dataset

In [9]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split (and therefore every accuracy reported below) reproducible across
# kernel restarts; without it each run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Vectorizer

In [10]:
# TF-IDF vectorizer with the vocabulary capped at 15 terms.
# NOTE(review): this single instance is passed to every pipeline below, so
# each pipeline fit re-fits the same vectorizer object.
tfidf_vect = TfidfVectorizer(max_features=15)

### 1) LogisticRegression

In [11]:
# Logistic regression with the liblinear solver. scikit-learn documents
# random_state as being used by liblinear when shuffling the data, so it is
# pinned here to keep repeated runs comparable.
logistic_clf = LogisticRegression(solver='liblinear', random_state=42)

#### Pipeline

In [12]:
# Chain text vectorization and the logistic-regression classifier so both
# are fitted together on the raw training sentences.
clf_pipeline = Pipeline([
    ('tfidf_vect', tfidf_vect),
    ('classifier', logistic_clf),
])

# fit() returns the pipeline itself, so pipeline_model is the fitted clf_pipeline.
pipeline_model = clf_pipeline.fit(X_train, y_train)

#### Score

In [13]:
# Predict labels for the held-out sentences and compare with the true labels.
y_pred = pipeline_model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.8995664739884393

#### Save

In [14]:
import pickle

In [15]:
# pickle.dump(pipeline_model, open('model.pkl',  'wb'))

### 2) Decision Tree Classifier

In [16]:
# Depth-limited decision tree. The tree builder permutes the candidate
# features at each split, so equally good splits can be chosen differently
# from run to run; fixing random_state makes the fitted tree reproducible.
decision_tree_clf = DecisionTreeClassifier(max_depth=10, random_state=42)

#### Pipeline

In [17]:
# Chain text vectorization and the decision-tree classifier so both are
# fitted together on the raw training sentences.
clf_pipeline = Pipeline([
    ('tfidf_vect', tfidf_vect),
    ('classifier', decision_tree_clf),
])

# fit() returns the pipeline itself, so pipeline_model is the fitted clf_pipeline.
pipeline_model = clf_pipeline.fit(X_train, y_train)

#### Score

In [18]:
# Predict labels for the held-out sentences and compare with the true labels.
y_pred = pipeline_model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.898121387283237

#### Save

In [19]:
# pickle.dump(pipeline_model, open('model2.pkl',  'wb'))

### 3) LinearSVC

In [20]:
# Linear support-vector classifier. When the dual formulation is solved,
# the coordinate-descent solver shuffles the data, so random_state is pinned
# to make the fitted model reproducible.
linear_svc_clf = LinearSVC(C=1.0, max_iter=1000, random_state=42)

#### Pipeline

In [21]:
# Chain text vectorization and the linear SVC classifier so both are fitted
# together on the raw training sentences.
clf_pipeline = Pipeline([
    ('tfidf_vect', tfidf_vect),
    ('classifier', linear_svc_clf),
])

# fit() returns the pipeline itself, so pipeline_model is the fitted clf_pipeline.
pipeline_model = clf_pipeline.fit(X_train, y_train)

#### Score

In [22]:
# Predict labels for the held-out sentences and compare with the true labels.
y_pred = pipeline_model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.8995664739884393

#### Save

In [24]:
# Persist the most recently fitted pipeline. The context manager guarantees
# the file handle is flushed and closed even if pickling raises; the bare
# open() call previously left the handle to be closed by garbage collection.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)