In [18]:
%%time
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables (for label encoding)
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn import linear_model

# File system management
# for creating and removing a directory (folder), fetching its contents, changing and identifying the current directory, etc
import os
import sys

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
# seaborn is a library for making statistical graphics in Python
import seaborn as sns

# Read train data
ab_train = pd.read_csv('train.csv')

# Read test data
ab_test = pd.read_csv('test.csv')

## Converting Categorical Features

# Cleaning up Price's currency data with pandas
ab_train['Price'] = ab_train['Price'].replace({'\$':'',',':''},regex=True).astype('float')
ab_test['Price'] = ab_test['Price'].replace({'\$':'',',':''},regex=True).astype('float')

## Encoding Categorical Vairables

# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in ab_train:
    if ab_train[col].dtype == 'object':
        unique_list = list(ab_train[col].unique())
        unique_list = [x for x in unique_list if pd.isnull(x)==False]
        # If 2 or fewer unique categories
        if len(unique_list) <= 2:
            # Train on the training data
            le.fit(ab_train[col])
            # Transform both training and testing data
            ab_train[col] = le.transform(ab_train[col])
            ab_test[col] = le.transform(ab_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            print(col)
print('%d columns were label encoded.' % le_count)

# one-hot encoding of categorical variables

ab_train = pd.get_dummies(ab_train)
ab_test = pd.get_dummies(ab_test)

print('Training Features shape: ', ab_train.shape)
print('Testing Features shape: ', ab_test.shape)

ab_train.fillna(ab_train.mean(), inplace = True)
ab_test.fillna(ab_test.mean(), inplace = True)

## Aligining Training and Testing Data

train_labels = ab_train['Decision']

# Align the training and testing data, keep only columns present in both dataframes
ab_train, ab_test = ab_train.align(ab_test, join = 'inner', axis = 1)

# Add the target back in
ab_train['Decision'] = train_labels

## Imputing missing values for sklearn decision tree

# retrieve the numpy array
train_values = ab_train.values
test_values = ab_test.values

#degine the imputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
transformed_train_values = imp.fit_transform(train_values)
transformed_test_values = imp.fit_transform(test_values)


# Set up the train data with columns that are in use
X = ab_train.drop(['Decision','id'], axis=1)
y = ab_train['Decision']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)
y_pred=classifier.predict(X_test)

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)

X_test_test = ab_test.drop(['id'], axis=1)
y_pred = classifier.predict(X_test_test)

id = ab_test['id'].values.tolist()
df = pd.DataFrame({'id':id,'Decision':y_pred})

#df.to_csv('result_KNN.csv', index = False)

Host_is_superhost
Host_has_profile_pic
Host_identity_verified
Instant_bookable
4 columns were label encoded.
Training Features shape:  (7471, 107)
Testing Features shape:  (2440, 89)




Wall time: 1.31 s


In [6]:
# Read train data
ab_train = pd.read_csv('train.csv')

# Read test data
ab_test = pd.read_csv('test.csv')

In [7]:
## Converting Categorical Features

# Cleaning up Price's currency data with pandas
ab_train['Price'] = ab_train['Price'].replace({'\$':'',',':''},regex=True).astype('float')
ab_test['Price'] = ab_test['Price'].replace({'\$':'',',':''},regex=True).astype('float')

In [8]:
## Encoding Categorical Vairables

# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in ab_train:
    if ab_train[col].dtype == 'object':
        unique_list = list(ab_train[col].unique())
        unique_list = [x for x in unique_list if pd.isnull(x)==False]
        # If 2 or fewer unique categories
        if len(unique_list) <= 2:
            # Train on the training data
            le.fit(ab_train[col])
            # Transform both training and testing data
            ab_train[col] = le.transform(ab_train[col])
            ab_test[col] = le.transform(ab_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            print(col)
print('%d columns were label encoded.' % le_count)

Host_is_superhost
Host_has_profile_pic
Host_identity_verified
Instant_bookable
4 columns were label encoded.


In [9]:
# one-hot encoding of categorical variables

ab_train = pd.get_dummies(ab_train)
ab_test = pd.get_dummies(ab_test)

print('Training Features shape: ', ab_train.shape)
print('Testing Features shape: ', ab_test.shape)

ab_train.fillna(ab_train.mean(), inplace = True)
ab_test.fillna(ab_test.mean(), inplace = True)

Training Features shape:  (7471, 107)
Testing Features shape:  (2440, 89)


In [10]:
## Aligining Training and Testing Data

train_labels = ab_train['Decision']

# Align the training and testing data, keep only columns present in both dataframes
ab_train, ab_test = ab_train.align(ab_test, join = 'inner', axis = 1)

# Add the target back in
ab_train['Decision'] = train_labels

In [11]:
## Imputing missing values for sklearn decision tree

# retrieve the numpy array
train_values = ab_train.values
test_values = ab_test.values

#degine the imputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
transformed_train_values = imp.fit_transform(train_values)
transformed_test_values = imp.fit_transform(test_values)


# Set up the train data with columns that are in use
X = ab_train.drop(['Decision','id'], axis=1)
y = ab_train['Decision']

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)
y_pred=classifier.predict(X_test)

from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)
print(ac)

0.6595317725752509


In [15]:
X_test_test = ab_test.drop(['id'], axis=1)
y_pred = classifier.predict(X_test_test)
print(y_pred)



[0 0 0 ... 0 0 0]


In [73]:
id = ab_test['id'].values.tolist()
df = pd.DataFrame({'id':id,'Decision':y_pred})
print(df)

df.to_csv('result_KNN.csv', index = False)

        id  Decision
0        1         1
1        2         1
2        3         1
3        4         1
4        5         1
...    ...       ...
2435  2436         1
2436  2437         0
2437  2438         1
2438  2439         0
2439  2440         0

[2440 rows x 2 columns]


In [48]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test,y_pred)
print(ac)

0.6702341137123746


In [16]:
## predict classification with test data

X_test_test = ab_test.drop(['id'], axis=1)
test_pre = classifier.predict(X_test_test)

id = ab_test['id'].values.tolist()
df = pd.DataFrame({'id':id,'Decision':test_pre})
print(df)

df.to_csv('result_KNN.csv', index = False)



        id  Decision
0        1         0
1        2         0
2        3         0
3        4         0
4        5         0
...    ...       ...
2435  2436         0
2436  2437         0
2437  2438         0
2438  2439         0
2439  2440         0

[2440 rows x 2 columns]


In [None]:
0.39344