In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,f1_score,classification_report

#Importing the Affairs Dataset

In [3]:
# Load the raw survey data; the relative path is resolved against the notebook's working directory.
Affairs=pd.read_csv('datasets_3452_5576_Affairs.csv')

In [4]:
# Peek at five randomly chosen rows (no seed is passed, so the rows differ on every run).
Affairs.sample(5)

Unnamed: 0.1,Unnamed: 0,affairs,gender,age,yearsmarried,children,religiousness,education,occupation,rating
463,174,12,female,42.0,15.0,yes,5,9,4,1
548,1236,7,male,42.0,15.0,yes,3,20,5,4
135,667,0,female,32.0,15.0,yes,3,18,5,4
488,423,7,male,52.0,15.0,yes,2,20,6,4
28,153,0,male,27.0,4.0,yes,3,16,5,5


In [5]:
# List the distinct values of the raw `affairs` column before it is turned into a binary target.
Affairs.affairs.unique()

array([ 0,  3,  7, 12,  1,  2], dtype=int64)

The `affairs` column takes more than two distinct values, so we convert it into a binary target and treat this as a binary classification problem.

Anyone with at least 1 affair is labelled 1 (having an affair); anyone with zero affairs is labelled 0.

In [7]:
# Binary target: 1 when at least one affair was reported, 0 otherwise.
# Vectorised comparison; the explicit int64 keeps the dtype platform-independent.
Affairs['affair']=Affairs['affairs'].ge(1).astype('int64')

In [8]:
#Dropping the original "affairs" column & the unnecessary "Unnamed: 0" column now that the binary target column has been created

In [9]:
# Remove the raw count and the stray index column; rebind the name rather than mutating in place.
Affairs=Affairs.drop(columns=['affairs','Unnamed: 0'])

In [10]:
# Confirm the two dropped columns are gone and the new `affair` target is present.
Affairs.head()

Unnamed: 0,gender,age,yearsmarried,children,religiousness,education,occupation,rating,affair
0,male,37.0,10.0,no,3,18,7,4,0
1,female,27.0,4.0,no,4,14,6,4,0
2,female,32.0,15.0,yes,1,12,1,4,0
3,male,57.0,15.0,yes,5,18,6,5,0
4,male,22.0,0.75,no,2,17,6,3,0


#Transforming the gender & children variables from categorical to numeric variables.
#Gender: male = 1, female = 0
#Children: yes = 1, no = 0

In [12]:
# Check dtypes and non-null counts to see which columns still need numeric encoding.
Affairs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         601 non-null    object 
 1   age            601 non-null    float64
 2   yearsmarried   601 non-null    float64
 3   children       601 non-null    object 
 4   religiousness  601 non-null    int64  
 5   education      601 non-null    int64  
 6   occupation     601 non-null    int64  
 7   rating         601 non-null    int64  
 8   affair         601 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 42.4+ KB


In [13]:
# Encode gender as 1 (male) / 0 (female).
# Series.map turns any value missing from the dict into NaN, so this cell must not be run twice.
Affairs['gender']=Affairs['gender'].map({'male':1,'female':0})

In [14]:
# Encode children as 1 (yes) / 0 (no).
# Series.map turns any value missing from the dict into NaN, so this cell must not be run twice.
Affairs['children']=Affairs['children'].map({'yes':1,'no':0})

In [15]:
# Both mapped columns should now be int64; float64 here would indicate unmapped (NaN) values.
Affairs.dtypes

gender             int64
age              float64
yearsmarried     float64
children           int64
religiousness      int64
education          int64
occupation         int64
rating             int64
affair             int64
dtype: object

In [16]:
# Quick look at the first rows of the fully numeric frame.
Affairs.head(3)

Unnamed: 0,gender,age,yearsmarried,children,religiousness,education,occupation,rating,affair
0,1,37.0,10.0,0,3,18,7,4,0
1,0,27.0,4.0,0,4,14,6,4,0
2,0,32.0,15.0,1,1,12,1,4,0


In [17]:
# Feature matrix: every column except the binary target.
X=Affairs.drop(columns='affair')

In [18]:
# Target vector: the binary `affair` column.
Y=Affairs['affair']

#Dividing the dataset into train and test

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions may differ between the two parts.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=345)

In [22]:
# Baseline: logistic regression with default settings, to be fitted on the unscaled features.
Logis_affair_model=LogisticRegression()

In [23]:
# Fit on the raw (unscaled) training features.
# With the default max_iter the solver stops before converging here, as the warning in the output shows.
Logis_affair_model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#The solver did not converge on the raw features, so we standardize the features first and then build the logistic regression model again.

In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
# Standardizer with default settings; it is fitted in the next step and later pickled alongside the model.
scaler=StandardScaler()

In [27]:
# Learn the scaling statistics from X and apply them in one step.
# NOTE(review): the scaler is fitted on ALL rows, including those later held out as test data,
# so test-set statistics leak into the scaling. Fitting on the training split only would avoid this.
X_scaled=scaler.fit_transform(X)

In [28]:
# Wrap the scaled values back into a DataFrame so the original column names are kept.
X_scaled=pd.DataFrame(X_scaled,columns=X.columns)

In [29]:
# 70/30 split of the scaled features.
# It uses a different random_state from the unscaled split, so it is a separate partition from X_train/X_test.
TrX,TeX,TrY,TeY=train_test_split(X_scaled,Y,test_size=.3,random_state=478)

In [30]:
# Second logistic regression (default settings), this time for the standardized features.
Log_aff_model=LogisticRegression()

In [31]:
# Fit on the standardized training features; the output shows no convergence warning this time.
Log_aff_model.fit(TrX,TrY)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

##Training accuracy

In [33]:
# Put the true training labels next to the model's predictions for the same rows.
Acc_train=pd.DataFrame({'Actual':TrY,'Pred':Log_aff_model.predict(TrX)})

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
# Share of training rows whose predicted label equals the actual label.
accuracy_score(Acc_train.Actual,Acc_train.Pred)

0.7785714285714286

##Testing Accuracy

In [37]:
# Put the true test labels next to the model's predictions for the held-out rows.
# NOTE: the prediction column is 'pred' (lower-case) here but 'Pred' in Acc_train.
Acc_test=pd.DataFrame({'Actual':TeY,'pred':Log_aff_model.predict(TeX)})

In [38]:
# Share of held-out rows whose predicted label equals the actual label.
accuracy_score(Acc_test.Actual,Acc_test.pred)

0.7403314917127072

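#Training and testing accuracy are close (0.78 vs 0.74), so the model is not obviously overfitting. Accuracy alone is not enough to judge it, though: about 75% of the rows belong to class 0, so the confusion matrices and classification report below should be checked as well.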

#Confusion Matrix on Training Data

In [41]:
# True labels are passed first, so rows are actual classes and columns are predicted classes.
confusion_matrix(Acc_train.Actual,Acc_train.Pred)

array([[305,  10],
       [ 83,  22]], dtype=int64)

#Confusion Matrix on Testing  Data

In [43]:
# True labels are passed first, so rows are actual classes and columns are predicted classes.
confusion_matrix(Acc_test.Actual,Acc_test.pred)

array([[129,   7],
       [ 40,   5]], dtype=int64)

#Classification Report based on Training Data

In [45]:
# Per-class precision / recall / F1 on the training rows (printed because the report is a formatted string).
print(classification_report(Acc_train.Actual,Acc_train.Pred))

              precision    recall  f1-score   support

           0       0.79      0.97      0.87       315
           1       0.69      0.21      0.32       105

    accuracy                           0.78       420
   macro avg       0.74      0.59      0.59       420
weighted avg       0.76      0.78      0.73       420



#Checking with Crossvalidation as well. 

In [47]:
from sklearn.model_selection import cross_val_score

In [48]:
# Mean score over 10 cross-validation folds.
# NOTE(review): X_scaled was standardized using all rows, so each fold's validation rows influenced
# the scaling. Cross-validating a scaler + model pipeline on the unscaled X would avoid that.
cross_val_score(Log_aff_model,X_scaled,Y,cv=10).mean()

0.7554371584699453

#Cross validation is giving similar accuracy as well. 

#Hence, we will go with this model & save it in pickle file for deployment. 

In [80]:
import pickle

In [83]:
# Persist the fitted logistic regression model.
with open('extra_affair.pickle','wb') as f:
    pickle.dump(Log_aff_model,f)

In [85]:
# Persist the fitted scaler too: new inputs must be scaled with the same statistics before prediction.
with open('scale.pickle','wb')as f:
    pickle.dump(scaler,f)

#Loading the pickle files back to check that they work.

In [86]:
# Reload the saved model. Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('extra_affair.pickle','rb')as f:
    m=pickle.load(f)

In [88]:
# Reload the saved scaler; this rebinds `scaler` to the unpickled copy.
with open('scale.pickle','rb')as f:
    scaler=pickle.load(f)

In [94]:
# Smoke-test the reloaded model and scaler on one hand-entered row.
# Values follow the training column order:
# gender, age, yearsmarried, children, religiousness, education, occupation, rating.
sample=[[1,37,10,0,3,18,7,4]]

# Use transform(), not fit_transform(): refitting on this single row would replace the
# training mean/std with the row itself, turning every feature into 0 so the
# prediction would no longer depend on the input.
prediction=m.predict(scaler.transform(sample))[0]

if prediction==0:
    print('No Affair')
else:
    print('Affair')

No Affair
