In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
import numpy as np

In [3]:
# Read the purchase records CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="iphone_purchase_records.csv")
# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
5,Male,27,58000,0
6,Female,27,84000,0
7,Female,32,150000,1
8,Male,25,33000,0
9,Female,35,65000,0


## Data Preprocessing

In [4]:
# Count the missing values in each column.
df.isna().sum()

Gender             0
Age                0
Salary             0
Purchase Iphone    0
dtype: int64

In [5]:
# Encode the categorical Gender column as integers.
# Resulting codes: Female -> 0, Male -> 1.
gender_encoder = LabelEncoder()
df['Gender'] = gender_encoder.fit_transform(df['Gender'])
df.head()

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0


In [6]:
# Dataset dimensions as (rows, columns).
df.shape

(400, 4)

In [7]:
# Features: every column except the label column.
X = df.drop(columns='Purchase Iphone')
# Label: the 'Purchase Iphone' column on its own.
y = df.loc[:, 'Purchase Iphone']

In [8]:
# Standardization: rescale each feature column to zero mean and unit variance (done in the next cell)

In [9]:
# Standardize every feature column; the fitted scaler is kept so the same
# transformation can be applied to new samples later.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set statistics leak into preprocessing. Consider
# fitting on the training split only.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X

array([[ 1.02020406, -1.78179743, -1.49004624],
       [ 1.02020406, -0.25358736, -1.46068138],
       [-0.98019606, -1.11320552, -0.78528968],
       ...,
       [-0.98019606,  1.17910958, -1.46068138],
       [ 1.02020406, -0.15807423, -1.07893824],
       [-0.98019606,  1.08359645, -0.99084367]])

In [10]:
# Shape of the standardized feature matrix as (rows, feature columns).
X.shape

(400, 3)

In [11]:
# Convert the label Series to a NumPy array shaped as a single column: (n_samples, 1).
y = y.to_numpy().reshape(-1, 1)
y

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [12]:
# Shape of the label array after reshaping it into a single column.
y.shape

(400, 1)

## Data Splitting

In [13]:
# Show the container types of the features and the labels before splitting.
print(type(X),type(y))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class balance
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [15]:
# Rows and feature columns in the training split.
X_train.shape

(320, 3)

In [16]:
# Rows and feature columns in the held-out test split.
X_test.shape

(80, 3)

## Model training: Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
# Gini-impurity decision tree. random_state is fixed because scikit-learn randomly
# permutes candidate features at each split, so an unseeded tree (and the accuracy
# figures derived from it) can differ between runs. The seed value matches the one
# passed to train_test_split.
clf = DecisionTreeClassifier(criterion='gini', random_state=2)

In [19]:
# Train the decision tree on the training split; the fitted estimator is the cell output.
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier()

In [20]:
# Predicted labels for the training rows (despite the X_ prefix, these are labels, not features).
X_train_predict = clf.predict(X=X_train)

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
# accuracy_score expects (y_true, y_pred): ground-truth labels first, predictions second.
# Accuracy is symmetric, so the value is unchanged, but the documented order keeps this
# call consistent with metrics that are not symmetric (precision, recall, ...).
training_accuracy = accuracy_score(y_train, X_train_predict)
training_accuracy

0.996875

In [23]:
# Predicted labels for the held-out test rows.
X_test_pr = clf.predict(X_test)
# Ground truth first, predictions second, matching accuracy_score(y_true, y_pred).
test_accuracy = accuracy_score(y_test, X_test_pr)
test_accuracy

0.8875

## Prediction System

In [24]:
# Check the type of a single element of the test feature matrix.
type(X_test[0, 1])

numpy.float64

In [25]:
# Look at the first test row; its values are already standardized.
X_test[0]

array([-0.98019606,  1.94321462, -1.37258681])

In [26]:
# Predict the purchase label for one hand-written sample given as
# (Gender, Age, Salary), in the same column order as the training features.
# Gender uses the label encoding applied earlier: 0 = female, 1 = male.
X_sample = (1,35,150000)

# changing the input_data to numpy array
X_sample_as_np = np.asarray(X_sample)

# reshape the array as we are predicting for one instance
X_sample_reshaped = X_sample_as_np.reshape(1,-1)

# The scaler was fitted on a DataFrame, so it remembers the column names. Label the
# sample with those same names so scaler.transform() does not warn about missing
# feature names. On scikit-learn versions without `feature_names_in_` this falls
# back to default column labels.
X_sample_frame = pd.DataFrame(
    X_sample_reshaped,
    columns=getattr(scaler, "feature_names_in_", None),
)

# standardize the input data with the statistics the scaler learned when it was fitted
final = scaler.transform(X_sample_frame)

prediction = clf.predict(final)
print(prediction)

if prediction[0] == 0:
    print("Not an Iphone user :(")
else:
    print("Iphone user :)")

[1]
Iphone user :)


In [27]:
# Predict the purchase label for a second hand-written sample given as
# (Gender, Age, Salary), in the same column order as the training features.
# Gender uses the label encoding applied earlier: 0 = female, 1 = male.
X_sample = (0,25,50000)

# changing the input_data to numpy array
X_sample_as_np = np.asarray(X_sample)

# reshape the array as we are predicting for one instance
X_sample_reshaped = X_sample_as_np.reshape(1,-1)

# The scaler was fitted on a DataFrame, so it remembers the column names. Label the
# sample with those same names so scaler.transform() does not warn about missing
# feature names. On scikit-learn versions without `feature_names_in_` this falls
# back to default column labels.
X_sample_frame = pd.DataFrame(
    X_sample_reshaped,
    columns=getattr(scaler, "feature_names_in_", None),
)

# standardize the input data with the statistics the scaler learned when it was fitted
final = scaler.transform(X_sample_frame)

prediction = clf.predict(final)
print(prediction)

if prediction[0] == 0:
    print("Not an Iphone user :(")
else:
    print("Iphone user :)")

[0]
Not an Iphone user :(
