# Extracting a data subset from a data set

In [1]:
import pandas as pd

In [5]:
data = pd.read_csv(r'../datasets/customer-churn-model/Customer Churn Model.txt')

In [6]:
data.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


Selecting a serie from the dataset (type = Series):

In [8]:
account_length = data['Account Length']
account_length.head()

0    128
1    107
2    137
3     84
4     75
Name: Account Length, dtype: int64

In [11]:
type(account_length)

pandas.core.series.Series

Selecting multiple series from the dataset (typer = DataFrame):

In [12]:
subset = data[['Account Length', 'Phone', 'Eve Charge', 'Day Calls']]
subset.head()

Unnamed: 0,Account Length,Phone,Eve Charge,Day Calls
0,128,382-4657,16.78,110
1,107,371-7191,16.62,123
2,137,358-1921,10.3,114
3,84,375-9999,5.26,71
4,75,330-6626,12.61,113


In [13]:
type(subset)

pandas.core.frame.DataFrame

An easier way to get a subset:

In [16]:
desired_columns = ['Account Length', 'Phone', 'Eve Charge', 'Night Calls']
subset = data[desired_columns]
subset.head()

Unnamed: 0,Account Length,Phone,Eve Charge,Night Calls
0,128,382-4657,16.78,91
1,107,371-7191,16.62,103
2,137,358-1921,10.3,104
3,84,375-9999,5.26,89
4,75,330-6626,12.61,121


An easier way to get a subset when there are too many subsets (getting the complementary):

In [20]:
desired_columns = ['Account Length', 'VMail Message', 'Day Calls']
desired_columns

['Account Length', 'VMail Message', 'Day Calls']

In [21]:
all_columns_list = data.columns.values.tolist()
all_columns_list

['State',
 'Account Length',
 'Area Code',
 'Phone',
 "Int'l Plan",
 'VMail Plan',
 'VMail Message',
 'Day Mins',
 'Day Calls',
 'Day Charge',
 'Eve Mins',
 'Eve Calls',
 'Eve Charge',
 'Night Mins',
 'Night Calls',
 'Night Charge',
 'Intl Mins',
 'Intl Calls',
 'Intl Charge',
 'CustServ Calls',
 'Churn?']

In [25]:
# A simple trick to get the complementary list of the columns (based on desired columns):
sublist = [x for x in all_columns_list if x not in desired_columns]
sublist

['State',
 'Area Code',
 'Phone',
 "Int'l Plan",
 'VMail Plan',
 'Day Mins',
 'Day Charge',
 'Eve Mins',
 'Eve Calls',
 'Eve Charge',
 'Night Mins',
 'Night Calls',
 'Night Charge',
 'Intl Mins',
 'Intl Calls',
 'Intl Charge',
 'CustServ Calls',
 'Churn?']

In [27]:
subset = data[sublist]
subset.head()

Unnamed: 0,State,Area Code,Phone,Int'l Plan,VMail Plan,Day Mins,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,415,382-4657,no,yes,265.1,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,415,371-7191,no,yes,161.6,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,415,358-1921,no,no,243.4,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,408,375-9999,yes,no,299.4,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,415,330-6626,yes,no,166.7,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [28]:
# Another trick to get the complementary:
a = set(desired_columns)
b = set(all_columns_list)
sublist = b-a
sublist = list(sublist)
subset = data[sublist]
subset

Unnamed: 0,VMail Plan,CustServ Calls,Phone,Eve Mins,Int'l Plan,Night Charge,Day Mins,Eve Charge,Area Code,Intl Charge,State,Churn?,Day Charge,Intl Calls,Night Mins,Intl Mins,Night Calls,Eve Calls
0,yes,1,382-4657,197.4,no,11.01,265.1,16.78,415,2.70,KS,False.,45.07,3,244.7,10.0,91,99
1,yes,1,371-7191,195.5,no,11.45,161.6,16.62,415,3.70,OH,False.,27.47,3,254.4,13.7,103,103
2,no,0,358-1921,121.2,no,7.32,243.4,10.30,415,3.29,NJ,False.,41.38,5,162.6,12.2,104,110
3,no,2,375-9999,61.9,yes,8.86,299.4,5.26,408,1.78,OH,False.,50.90,7,196.9,6.6,89,88
4,no,3,330-6626,148.3,yes,8.41,166.7,12.61,415,2.73,OK,False.,28.34,3,186.9,10.1,121,122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,yes,2,414-4276,215.5,no,12.56,156.2,18.32,415,2.67,AZ,False.,26.55,6,279.1,9.9,83,126
3329,no,3,370-3271,153.4,no,8.61,231.1,13.04,415,2.59,WV,False.,39.29,4,191.3,9.6,123,55
3330,no,2,328-8230,288.8,no,8.64,180.8,24.55,510,3.81,RI,False.,30.74,6,191.9,14.1,91,58
3331,no,2,364-6381,159.6,yes,6.26,213.8,13.57,510,1.35,CT,False.,36.35,10,139.2,5.0,137,84
