In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo every file found.
# `file1` is overwritten on each iteration, so after the loop it holds the
# path of the last file visited; a later cell reads that path as the dataset.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file1 = os.path.join(dirname, filename)
        print(file1)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the latin-1 encoded CSV, keep only the `v1`/`v2` columns and give
# them descriptive names (`label`, `message`).
email = (
    pd.read_csv(file1, encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

In [None]:
# Preview the first rows of the two-column (label, message) frame.
email.head()

In [None]:
# Relative frequency of each label value (proportions, not raw counts).
email.label.value_counts(normalize=True)

In [None]:
# Encode the label as an integer: 'ham' -> 0, anything else -> 1.
# Already-encoded values (0 / 1) are passed through unchanged so that running
# this cell a second time does not turn every label into 1.
email['label'] = email['label'].apply(lambda value: 0 if value in ('ham', 0) else 1)

In [None]:
# Class proportions again, this time over the 0/1-encoded label column.
email.label.value_counts(normalize=True)

## Data Preprocessing

#### Saving Message & Label as a tuple

In [None]:
# Pair every message with its label as (message, label) tuples.
# Zipping the two columns directly avoids iterrows(), which materialises a
# full Series object for each row just to read two values from it.
data = list(zip(email['message'], email['label']))

In [None]:
# Peek at the first five (message, label) tuples.
data[0:5]

In [None]:
# Number of (message, label) pairs collected.
len(data)

#### Creating a preprocessing function

In [None]:
#Preprocessing Libraries 
from  nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

In [None]:
#initialize
# Shared normaliser instances; preprocessing() uses one or the other
# depending on its `stem` flag.
stemmer=SnowballStemmer('english')
lemmatizer=WordNetLemmatizer()

In [None]:
import re

#cleaning data
# Keep only the runs of word characters in each message and re-join them with
# single spaces; everything else (punctuation, symbols) is dropped.
# The pattern is a raw string: '\w' inside a normal string literal is an
# invalid escape sequence that newer Python versions warn about.
word_pattern = re.compile(r'\w+')

cleaned_data = [
    (' '.join(word_pattern.findall(text)), label)
    for text, label in data
]
    

In [None]:
# Peek at the first five messages after the regex clean-up.
cleaned_data[0:5]

In [None]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocessing(document,stem=True):
    """Normalise one message into a space-separated string of processed tokens.

    Steps, in order: lower-case the text, tokenise it with ``word_tokenize``,
    drop tokens found in NLTK's English stop-word list, then reduce each
    remaining token with either the module-level ``stemmer`` or ``lemmatizer``.

    Parameters
    ----------
    document : str
        Raw (or regex-cleaned) message text.
    stem : bool, default True
        If True, apply ``stemmer.stem`` to each token; otherwise apply
        ``lemmatizer.lemmatize``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    tokens = word_tokenize(document.lower())

    # Look the stop words up once per call, in a set. The original evaluated
    # stopwords.words('english') for every single token and searched the
    # resulting list linearly.
    stop_words = _english_stopwords()
    tokens = [token for token in tokens if token not in stop_words]

    if stem:
        tokens = [stemmer.stem(token) for token in tokens]
    else:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)
    
    

In [None]:
# Run every cleaned message through preprocessing() in lemmatising mode
# (stem=False), keeping each label next to its processed text.
dataset = [
    (preprocessing(text, stem=False), label)
    for text, label in cleaned_data
]

In [None]:
# Tabulate the preprocessed (message, label) pairs as a two-column frame.
df = pd.DataFrame(dataset, columns =['message','label'])

In [None]:
# Preview the first rows of the preprocessed frame.
df.head()

### Creating Train and Test Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=1,
)

In [None]:
# Print the size of each split, one per line. Looping avoids building a tuple
# of print() results, which made the cell display (None, None, None, None).
for split in (X_train, X_test, y_train, y_test):
    print(len(split))

In [None]:
# Inspect the training portion of the message column.
X_train

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vectorizer on the training messages only; the same fitted
# object is reused later to transform the test messages.
vectorize= TfidfVectorizer()
X_train_trans=vectorize.fit_transform(X_train)

### Building the model

In [None]:
# svm
from sklearn.svm import SVC

# SVC with C=1000, fitted on the TF-IDF training matrix.
svm = SVC(C=1000).fit(X_train_trans, y_train)

In [None]:
# random forest model
from sklearn.ensemble import RandomForestClassifier

# Seed the forest (same seed as the train/test split) so that re-running the
# notebook reproduces the same trees, importances and scores.
ran_tree = RandomForestClassifier(random_state=1).fit(X_train_trans, y_train)


In [None]:
# Ten vocabulary terms with the highest random-forest feature importance.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and fall back to the old name only on releases
# that predate it.
if hasattr(vectorize, 'get_feature_names_out'):
    feature_names = vectorize.get_feature_names_out()
else:
    feature_names = vectorize.get_feature_names()

# Column 0 is the term, column 1 its importance.
pd.DataFrame(zip(feature_names, ran_tree.feature_importances_)).sort_values(by=1, ascending=False).head(10)

In [None]:
# logistic regression model
from sklearn.linear_model import LogisticRegression

# Default-configured classifier, fitted on the TF-IDF training matrix.
log = LogisticRegression()
log = log.fit(X_train_trans, y_train)

### Evaluation

In [None]:
# Transform the test messages with the vectorizer fitted on the training set
# (transform only, no refitting).
X_test_trans=vectorize.transform(X_test)

In [None]:
from sklearn.metrics import classification_report

# Classification report for the random forest on the held-out test messages.
forest_predictions = ran_tree.predict(X_test_trans)
print(classification_report(y_test, forest_predictions))

In [None]:
from sklearn.metrics import classification_report

# Classification report for logistic regression on the held-out test messages.
logistic_predictions = log.predict(X_test_trans)
print(classification_report(y_test, logistic_predictions))