In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
# Replace the imported nltk `stopwords` reader with the English stop-word list itself.
# NOTE(review): this shadows the import, and presumably needs the nltk 'stopwords'
# corpus to be available locally — confirm the environment provides it.
stopwords = stopwords.words('english')

## Titanic

In [2]:
# Load the three Titanic CSV files in one pass; the order of the paths matches
# the order of the target names (train, test, test_).
train, test, test_ = map(
    pd.read_csv,
    ('Datasets/train.csv', 'Datasets/test.csv', 'Datasets/test_.csv'),
)

In [3]:
# Stack train on top of test row-wise and renumber the rows 0..n-1.
df = pd.concat((train, test), ignore_index=True)

In [4]:
# Preview test_: one Survived label per PassengerId.
test_.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [5]:
# Attach the labels from test_ by PassengerId; how='left' keeps every row of df.
# Both frames carry a Survived column, so the result holds Survived_x (from df)
# and Survived_y (from test_).
df = (
    df.merge(test_, how='left', on='PassengerId')
      .reset_index(drop=True)
)

In [6]:
# Build a single label column: take Survived_x where it is present and fall back
# to Survived_y otherwise. Done column-wise instead of looping over rows with
# .loc, so it no longer depends on the index being exactly 0..n-1.
# The cast raises if a row has no label in either column, as int(NaN) did before.
df['Survived'] = df['Survived_x'].fillna(df['Survived_y']).astype('int64')

In [7]:
# Inspect the merged frame, including the Survived_x / Survived_y / Survived columns.
df.head()

Unnamed: 0,PassengerId,Survived_x,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived_y,Survived
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,,0


In [8]:
# (rows, columns) of the combined frame.
df.shape

(1309, 14)

In [9]:
# Column dtypes and non-null counts, to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived_x   891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
 12  Survived_y   418 non-null    float64
 13  Survived     1309 non-null   int64  
dtypes: float64(4), int64(5), object(5)
memory usage: 143.3+ KB


In [10]:
# Collect the labels of rows whose Age is missing, then rebuild df without them
# and with a fresh 0..n-1 index.
idx = df.index[df['Age'].isna().to_numpy()]
df = df.drop(index=idx).reset_index(drop=True)

In [11]:
# Re-check non-null counts after removing the rows with missing Age.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1046 non-null   int64  
 1   Survived_x   714 non-null    float64
 2   Pclass       1046 non-null   int64  
 3   Name         1046 non-null   object 
 4   Sex          1046 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1046 non-null   int64  
 7   Parch        1046 non-null   int64  
 8   Ticket       1046 non-null   object 
 9   Fare         1045 non-null   float64
 10  Cabin        272 non-null    object 
 11  Embarked     1044 non-null   object 
 12  Survived_y   332 non-null    float64
 13  Survived     1046 non-null   int64  
dtypes: float64(4), int64(5), object(5)
memory usage: 114.5+ KB


In [12]:
# Look at the raw Ticket values.
df['Ticket']

0                A/5 21171
1                 PC 17599
2         STON/O2. 3101282
3                   113803
4                   373450
               ...        
1041    SOTON/O.Q. 3101315
1042                 19928
1043                347086
1044              PC 17758
1045    SOTON/O.Q. 3101262
Name: Ticket, Length: 1046, dtype: object

In [13]:
# Columns removed before modelling: the merge-suffixed Survived_x / Survived_y
# (already combined into Survived) plus PassengerId, Name, Ticket and Cabin.
cols = [
    'PassengerId',
    'Survived_x',
    'Survived_y',
    'Name',
    'Ticket',
    'Cabin',
]
df = df.drop(cols, axis='columns')

In [14]:
# Mean Pclass and Fare for each passenger class, printed one result after another.
class1, class2, class3 = (
    df.loc[df['Pclass'] == pclass, ['Pclass', 'Fare']].mean()
    for pclass in (1, 2, 3)
)
print(class1, class2, class3, sep='\n')

Pclass     1.000000
Fare      92.229358
dtype: float64
Pclass     2.000000
Fare      21.855044
dtype: float64
Pclass     3.000000
Fare      12.879299
dtype: float64


In [15]:
# Correlate only the numeric columns: df still holds the text columns Sex and
# Embarked here, and recent pandas versions raise on them instead of silently
# skipping them.
corr = df.select_dtypes(include='number').corr()
# Show only the strongly correlated pairs (|r| > 0.5); every other cell becomes NaN.
corr[corr.abs() > 0.5]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
Pclass,1.0,,,,-0.565255,
Age,,1.0,,,,
SibSp,,,1.0,,,
Parch,,,,1.0,,
Fare,-0.565255,,,,1.0,
Survived,,,,,,1.0


In [16]:
# Correlation of Pclass and Fare with the Survived target.
corr.loc[['Pclass', 'Fare'],['Survived']]

Unnamed: 0,Survived
Pclass,-0.28208
Fare,0.239842


In [17]:
# Remove Fare from the feature set.
# NOTE(review): presumably dropped because it overlaps with Pclass (r = -0.57 above) — confirm.
df = df.drop(columns='Fare')

In [18]:
# One-hot encode the remaining text columns, dropping the first level of each.
# dtype is pinned so the indicators stay 0/1 integers on every pandas version
# (newer releases default to booleans).
# NOTE(review): rows with a missing Embarked get all-zero indicators, i.e. they
# look like the dropped first level — confirm this is intended.
df = pd.get_dummies(df, drop_first=True, dtype='uint8')

In [19]:
# Preview the frame after one-hot encoding.
df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Survived,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,0,1,0,1
1,1,38.0,1,0,1,0,0,0
2,3,26.0,0,0,1,0,0,1
3,1,35.0,1,0,1,0,0,1
4,3,35.0,0,0,0,1,0,1


In [20]:
# Separate the predictors (everything except Survived) from the target column.
X, y = df.drop(columns=['Survived']), df['Survived']

In [21]:
# Preview the feature matrix.
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,1,0,1
1,1,38.0,1,0,0,0,0
2,3,26.0,0,0,0,0,1
3,1,35.0,1,0,0,0,1
4,3,35.0,0,0,1,0,1


In [22]:
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:
# (rows, features) of the training split.
X_train.shape

(836, 7)

In [24]:
# (rows, features) of the test split.
X_test.shape

(210, 7)

In [25]:
# Create a Gaussian naive Bayes classifier, then fit it on the training split.
# fit() hands back the fitted estimator, which is what clf ends up bound to.
clf = GaussianNB()
clf = clf.fit(X_train, y_train)

In [26]:
# Report accuracy (as a percentage, two decimals) on the training and test splits.
for caption, features, labels in (
    ('accuracy train:', X_train, y_train),
    ('accuracy test:', X_test, y_test),
):
    print(caption, round(clf.score(features, labels)*100, 2))

accuracy train: 84.45
accuracy test: 84.29


In [27]:
# Predicted class for the first five test rows.
clf.predict(X_test)[:5]

array([0, 0, 1, 1, 0])

In [28]:
# Per-class probabilities for the first five test rows (one column per class).
clf.predict_proba(X_test)[:5]

array([[0.95420943, 0.04579057],
       [0.95775044, 0.04224956],
       [0.05488577, 0.94511423],
       [0.21994457, 0.78005543],
       [0.57768795, 0.42231205]])

## Email Spam Detection

In [29]:
# Load the labelled messages (columns: Category, Message).
df1 = pd.read_csv('spam.csv')

In [30]:
# Preview the first labelled messages.
df1.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
# Show the full, untruncated text of the first message.
df1.loc[0, 'Message']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [32]:
# (rows, columns) of the message frame.
df1.shape

(5572, 2)

In [33]:
# Class balance: number of messages per Category.
df1['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [34]:
# Summary of both text columns: count, number of unique values, most frequent value.
df1.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [35]:
# Turn Category into a single 0/1 indicator column named Category_spam
# (drop_first removes the 'ham' level). dtype is pinned so the result is an
# integer column on every pandas version rather than a boolean on newer ones.
df1 = pd.get_dummies(data=df1, columns=['Category'], drop_first=True, dtype='uint8')

In [36]:
# Give the indicator column the shorter name Spam.
df1 = df1.rename(columns={'Category_spam': 'Spam'})

In [37]:
# Check the row index of the message frame.
df1.index

RangeIndex(start=0, stop=5572, step=1)

In [38]:
# Raw message text is the input; the 0/1 Spam column is the target.
X, y = df1['Message'], df1['Spam']

In [39]:
# Preview the target values.
y.head()

0    0
1    0
2    1
3    0
4    0
Name: Spam, dtype: uint8

In [40]:
# Hold out 20% of the messages for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [41]:
# Preview a few training messages.
X_train.head()

1114    No I'm good for the movie, is it ok if I leave...
3589    If you were/are free i can give. Otherwise nal...
3095    Have you emigrated or something? Ok maybe 5.30...
1012          I just got home babe, are you still awake ?
3320                      Kay... Since we are out already
Name: Message, dtype: object

In [42]:
# Set up a word-count vectorizer that skips the stop words, then fit it on the
# training messages only. fit() hands back the fitted vectorizer.
vectorizer = CountVectorizer(stop_words=stopwords)
vectorizer = vectorizer.fit(X_train)

In [43]:
# Apply the fitted vectorizer to both splits, training messages first.
X_train_vec, X_test_vec = (
    vectorizer.transform(messages) for messages in (X_train, X_test)
)

In [44]:
# Create a multinomial naive Bayes classifier, then fit it on the vectorized
# training messages. fit() hands back the fitted estimator.
clf2 = MultinomialNB()
clf2 = clf2.fit(X_train_vec, y_train)

In [45]:
# Report accuracy (as a percentage, two decimals) on the training and test splits.
for caption, features, labels in (
    ('accuracy train:', X_train_vec, y_train),
    ('accuracy test:', X_test_vec, y_test),
):
    print(caption, round(clf2.score(features, labels)*100, 2))

accuracy train: 99.35
accuracy test: 98.65


In [46]:
# First five test messages, for comparison with the predictions.
X_test[:5]

4456    Storming msg: Wen u lift d phne, u say "HELLO"...
690     <Forwarded from 448712404000>Please CALL 08712...
944     And also I've sorta blown him off a couple tim...
3768                  Sir Goodmorning, Once free call me.
1189    All will come alive.better correct any good lo...
Name: Message, dtype: object

In [47]:
# Predicted Spam value for the first five test messages.
clf2.predict(X_test_vec)[:5]

array([0, 1, 0, 0, 0], dtype=uint8)

In [48]:
# Per-class probabilities for the first five test messages (one column per class).
clf2.predict_proba(X_test_vec)[:5]

array([[1.00000000e+00, 1.09071517e-11],
       [3.08334631e-02, 9.69166537e-01],
       [9.99959863e-01, 4.01374573e-05],
       [9.16033641e-01, 8.39663588e-02],
       [9.99983223e-01, 1.67766818e-05]])