<a href="https://colab.research.google.com/github/chebbin/datasci_9_data_prep/blob/main/scripts/example2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import requests
from sklearn.preprocessing import OrdinalEncoder
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [30]:
df = pd.read_csv('https://healthdata.gov/resource/a8v3-a3m3.csv?$query=SELECT%20district_nces_id%2C%20district_name%2C%20week%2C%20learning_modality%2C%20operational_schools%2C%20student_count%2C%20city%2C%20state%2C%20zip_code%20ORDER%20BY%20week%20ASC')

In [31]:
# drop rows with missing values
df.dropna(inplace=True)
df

Unnamed: 0,district_nces_id,district_name,week,learning_modality,operational_schools,student_count,city,state,zip_code
0,100005,Albertville City,2020-09-06T00:00:00.000,Hybrid,6,5824,Albertville,AL,35950
1,100006,Marshall County,2020-09-06T00:00:00.000,Remote,15,5764,Guntersville,AL,35976
2,100007,Hoover City,2020-09-06T00:00:00.000,Hybrid,18,14061,Hoover,AL,35243
3,100008,Madison City,2020-09-06T00:00:00.000,Remote,11,11695,Madison,AL,35758
4,100011,Leeds City,2020-09-06T00:00:00.000,In Person,4,2076,Leeds,AL,35094
...,...,...,...,...,...,...,...,...,...
995,620850,Lamont Elementary,2020-09-06T00:00:00.000,Remote,4,2951,Lamont,CA,93241
996,620880,Lancaster Elementary,2020-09-06T00:00:00.000,Remote,22,14332,Lancaster,CA,93534
997,620910,Larkspur-Corte Madera,2020-09-06T00:00:00.000,Remote,3,1533,Larkspur,CA,94939
998,621000,Las Virgenes Unified,2020-09-06T00:00:00.000,Remote,16,10886,Calabasas,CA,91302


In [32]:
# Edit the dataframe to include only columns of interest
df_small = df[['week','learning_modality', 'operational_schools', 'student_count', 'zip_code']]
df_small

Unnamed: 0,week,learning_modality,operational_schools,student_count,zip_code
0,2020-09-06T00:00:00.000,Hybrid,6,5824,35950
1,2020-09-06T00:00:00.000,Remote,15,5764,35976
2,2020-09-06T00:00:00.000,Hybrid,18,14061,35243
3,2020-09-06T00:00:00.000,Remote,11,11695,35758
4,2020-09-06T00:00:00.000,In Person,4,2076,35094
...,...,...,...,...,...
995,2020-09-06T00:00:00.000,Remote,4,2951,93241
996,2020-09-06T00:00:00.000,Remote,22,14332,93534
997,2020-09-06T00:00:00.000,Remote,3,1533,94939
998,2020-09-06T00:00:00.000,Remote,16,10886,91302


In [34]:
# check for variable types
df_small.dtypes

week                   object
learning_modality      object
operational_schools     int64
student_count           int64
zip_code                int64
dtype: object

In [35]:
# change zip code from integer to string
df_small['zip_code'] = df_small['zip_code'].astype(str)
df_small.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['zip_code'] = df_small['zip_code'].astype(str)


week                   object
learning_modality      object
operational_schools     int64
student_count           int64
zip_code               object
dtype: object

In [36]:
# Encode categorical values for columns that are not numerical
enc = OrdinalEncoder()
enc.fit(df_small[['week']])
df_small['week'] = enc.transform(df_small[['week']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['week'] = enc.transform(df_small[['week']])


In [22]:
enc = OrdinalEncoder()
enc.fit(df_small[['zip_code']])
df_small['zip_code'] = enc.transform(df_small[['zip_code']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['zip_code'] = enc.transform(df_small[['zip_code']])


In [None]:
enc = OrdinalEncoder()
enc.fit(df_small[['learning_modality']])
df_small['learning_modality'] = enc.transform(df_small[['learning_modality']])

In [23]:
df_small

Unnamed: 0,week,learning_modality,operational_schools,student_count,zip_code
0,21.0,1.0,6,5824.0,2102.0
1,21.0,0.0,15,5764.0,2108.0
2,21.0,1.0,18,14061.0,2068.0
3,21.0,1.0,11,11695.0,2094.0
4,21.0,1.0,4,2076.0,2054.0
...,...,...,...,...,...
429860,22.0,2.0,1,76.0,10035.0
429861,22.0,2.0,1,510.0,7774.0
429862,22.0,2.0,1,240.0,11052.0
429863,22.0,2.0,1,233.0,9750.0


In [24]:
# Separate the features (X) and the target variable (y)
X = df_small.drop('learning_modality', axis=1)
y = df_small['learning_modality']

In [27]:
# Split the data into training and a temporary set, 80% and 20% respectively
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Further split the temporary set into validation and test sets, 10% each
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [29]:
# The sizes of each dataset after splitting
train_size = X_train.shape[0]
val_size = X_val.shape[0]
test_size = X_test.shape[0]

train_size, val_size, test_size

(343746, 42968, 42969)