In [1]:
#!/usr/bin/env python
# coding: utf-8

# Train/test split

Author: Jaren Haber

Project: Charter school identities

Institution: University of California, Berkeley, Dept. of Sociology

Date: August 27, 2018

This script splits the full charter data set (school and community variables, as well as WEBTEXT) into a training set and a test set using a random 80/20 split (following the Pareto principle).

In [39]:
## Import key packages

import pandas as pd # for working with dataframes
import gc # For managing garbage collector (to increase efficiency loading large files into memory)
from sklearn.model_selection import train_test_split # For splitting into train/test set
import sys # For terminal tricks

In [29]:
## Input and output file locations (all relative paths)

# Input: all text data; charter schools only, whether or not they are open
charters_path = "../../charters_full_2015_15_250_counts.pkl"

# Output: training set (80% of data)
train_path = "../../nowdata/traincf_2015_15_250_counts.pkl"

# Output: test set (the other 20% of data)
test_path = "../../nowdata/ignore/testcf_2015_15_250_counts.pkl"

In [4]:
# Load charter data into DF

# The garbage collector is switched off while the pickle is read (to increase
# efficiency loading a large file into memory). The try/finally guarantees it
# is switched back on even if the read fails (missing file, corrupt pickle,
# out of memory); otherwise the kernel would keep running with GC disabled.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
gc.disable() # disable garbage collector
try:
    df = pd.read_pickle(charters_path)
finally:
    gc.enable() # enable garbage collector again, whether or not the load succeeded

In [38]:
# Randomly assign rows to a training set (80%) and a held-out test set (20%)

proptest = 0.2 # share of rows that goes to the test set
print("Creating {}/{} train/test split...".format((1-proptest)*100, proptest*100))
# random_state is fixed so that re-running the cell yields the same split
traindf, testdf = train_test_split(df, test_size=proptest, random_state=0)

Creating 80.0/20.0 train/test split...


In [11]:
# Show (rows, columns) of the training set, then of the test set, one per line
print(traindf.shape, testdf.shape, sep="\n")

(9216, 401)
(2305, 401)


In [27]:
# Share of rows in the full data set whose WEBTEXT entry has length zero:
len(df.loc[df["WEBTEXT"].apply(len).eq(0)]) / len(df)

0.3577814425831091

In [25]:
# Share of rows in the training set whose WEBTEXT entry has length zero:
len(traindf.loc[traindf["WEBTEXT"].apply(len).eq(0)]) / len(traindf)

0.3615451388888889

In [26]:
# Share of rows in the test set whose WEBTEXT entry has length zero:
len(testdf.loc[testdf["WEBTEXT"].apply(len).eq(0)]) / len(testdf)

0.34273318872017355

In [30]:
# Write both splits to disk as pickles for later use
# (training set first, then test set)

pd.to_pickle(traindf, train_path)
pd.to_pickle(testdf, test_path)

PermissionError: [Errno 13] Permission denied: '../../nowdata/traincf_2015_15_250_counts.pkl'

In [None]:
raise SystemExit # Stop execution here once everything is done, just to be safe