In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the data
d19 = pd.read_csv(r"C:\Users\Njula Chakaya\OneDrive\Documents\Masters_Practicals\Collaborative App Devt\Data Files\D19.csv", header=1)

In [11]:
d19.head()

Unnamed: 0,BookingReference,Created Date,Reference,Attendee Status,Attended,DaysSinceFirst
0,B739461,2019-07-16,A802512,Cancelled,,0
1,B739462,2019-07-16,A802513,Attending,Yes,0
2,B739463,2019-07-16,A802514,Attending,Yes,0
3,B739464,2019-07-16,A802515,Attending,No,0
4,B739465,2019-07-16,A802516,Attending,No,0


In [4]:
# Change the Created Date column to datetime object
d19["Created Date"] = pd.to_datetime(d19["Created Date"], format='%d/%m/%Y')

In [5]:
d19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185 entries, 0 to 1184
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   BookingReference  1185 non-null   object        
 1   Created Date      1185 non-null   datetime64[ns]
 2   Reference         1185 non-null   object        
 3   Attendee Status   1185 non-null   object        
 4   Attended          1086 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 46.4+ KB


In [13]:
# Create a DaysSinceFirst column for the model

'''
How this works: it calculates the number of days between a specific date and the earliest date in the data.
eg Assuming 1 Jan 2020 is the earliest date, and  4 Jan 2020 is the second date, there's 3 days between the
two dates.

'''
d19['DaysSinceFirst'] = (d19['Created Date'] - d19['Created Date'].min()).dt.days

d19.tail()

Unnamed: 0,BookingReference,Created Date,Reference,Attendee Status,Attended,DaysSinceFirst
1180,B791099,2019-11-18,A855383,Attending,Yes,125
1181,B791101,2019-11-18,A855385,Attending,Yes,125
1182,B791103,2019-11-18,A855387,Attending,Yes,125
1183,B791104,2019-11-18,A855388,Attending,Yes,125
1184,B791104,2019-11-18,A855389,Attending,Yes,125


In [14]:
# Encode the Attendee Status ie change to numerical values
d19['Encoded'] = d19['Attendee Status'].astype('category').cat.codes

In [15]:
# Split the data into missing and not missing for the Attended column

not_missing = d19.dropna(subset= ['Attended']) # drops all the missing values in that column
missing = d19[d19['Attended'].isna()] # checks for the missing values in the column
not_missing['AttendedEncoded'] = not_missing['Attended'].astype('category').cat.codes # changes yes/no values to numerical values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_missing['AttendedEncoded'] = not_missing['Attended'].astype('category').cat.codes # changes yes/no values to numerical values


In [18]:
# Split into X and y variables

X = not_missing[['DaysSinceFirst', 'Encoded']]
y = not_missing['AttendedEncoded']

In [19]:
# Split into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [20]:
# Initialize and train the KNN model
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)

In [21]:
# Predict and check the accuracy score

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

Accuracy: 0.7935779816513762


In [22]:
# Predict the missing values
missing_values = missing[['DaysSinceFirst', 'Encoded']]
predicted_values = knn.predict(missing_values)

In [23]:
# Map the predicted endoded data back to the Attended Status
encode_map = dict(enumerate(not_missing['Attended'].astype('category').cat.categories))
predicted_values_map = [encode_map[pred] for pred in predicted_values]

In [24]:
# Update datset with predicted values
missing_indices = missing.index
d19.loc[missing_indices, 'Attended'] = predicted_values_map

In [26]:
d19.sample(10)

Unnamed: 0,BookingReference,Created Date,Reference,Attendee Status,Attended,DaysSinceFirst,Encoded
1181,B791101,2019-11-18,A855385,Attending,Yes,125,0
1111,B789978,2019-11-15,A854237,Attending,Yes,122,0
61,B739523,2019-07-16,A802574,Attending,No,0,0
360,B754005,2019-09-05,A817298,Attending,Yes,51,0
1102,B789946,2019-11-15,A854196,Attending,Yes,122,0
881,B783288,2019-11-08,A847380,Attending,Yes,115,0
93,B739555,2019-07-16,A802606,Attending,Yes,0,0
946,B787503,2019-11-12,A851636,Attending,Yes,119,0
241,B745064,2019-07-30,A808165,Attending,No,14,0
589,B773165,2019-10-21,A836949,Attending,Yes,97,0
