In [2]:
from io import StringIO
import numpy as np
import pandas as pd
# Render every float in DataFrame/Series output with exactly two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)
%precision 2

u'%.2f'

In [3]:
# Small CSV fixture with two deliberately missing cells: column C is empty in
# the second data row and column D is empty (trailing comma) in the third.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
0.0,11.0,12.0,'''

In [4]:
# io.StringIO only accepts text. On Python 2 a plain '' literal is a byte
# string and has to be decoded first; on Python 3 it is already text, and a
# bare `unicode(...)` call would raise NameError there. `bytes` is an alias
# of `str` on Python 2, so this single guard covers both interpreters.
if isinstance(csv_data, bytes):
    csv_data = csv_data.decode('utf-8')
# Empty CSV fields are parsed as NaN.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


###### Print the count of null / np.nan values per row and per column

In [5]:
# Missing values per column: isnull() yields a boolean frame, and summing
# down the rows (axis=0, the default) counts the True entries of each column.
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

In [6]:
# Missing values per row: sum across the columns instead of down them.
# sum() defaults to axis=0, so the row-wise direction must be requested.
df.isnull().sum(axis='columns')

0    0
1    1
2    1
dtype: int64

##### Show the underlying numpy array

In [7]:
# .values exposes the frame's contents as a plain NumPy array (labels dropped).
df.values

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,  nan,   8.],
       [  0.,  11.,  12.,  nan]])

##### Drop missing values (1) per row and (2) per column

In [8]:
# Row-wise drop with the defaults spelled out: a row is removed as soon as
# any one of its cells is NaN.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [9]:
# Column-wise drop: a column is removed as soon as any one of its cells is NaN.
df.dropna(axis=1, how='any')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


##### What are the effects of the optional parameters of `.dropna()`?

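##### Answer: `axis` selects whether rows (`0`, the default) or columns (`1`) are dropped; `how='any'` (the default) drops when at least one value is missing, while `how='all'` drops only when every value is missing; `thresh=n` keeps only the rows/columns that have at least `n` non-missing values; `subset` limits the check to the given labels; `inplace=True` modifies the DataFrame itself instead of returning a new one.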

##### Use [sklearn.preprocessing.Imputer](http://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values) to replace the missing values by the column mean.

In [10]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; SimpleImputer is its replacement. Fall back to the old class only
# when running on a release that predates sklearn.impute.
try:
    from sklearn.impute import SimpleImputer
    # SimpleImputer always imputes column-wise and takes the NaN value itself
    # as the missing-value marker.
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    # axis=0 imputes along columns: each NaN becomes the mean of the other
    # values in its column (axis=1 would use the row instead).
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Fit and transform on the same ndarray. Fitting on the DataFrame and then
# transforming the bare array makes recent scikit-learn versions warn about
# mismatched feature names.
imp = imp.fit(df.values)
imputed_data = imp.transform(df.values)
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [  0. ,  11. ,  12. ,   6. ]])

In [11]:
# Wrap the imputed array in a DataFrame for a tabular view; the array carries
# no labels, so the columns fall back to integer positions.
pd.DataFrame(data=imputed_data)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,0.0,11.0,12.0,6.0


##### Use [sklearn.preprocessing.scale](http://scikit-learn.org/stable/modules/preprocessing.html#standardization-or-mean-removal-and-variance-scaling) to standardize the data.

In [12]:
from sklearn.preprocessing import scale
# Standardize each column (axis=0): subtract the column mean and divide by the
# column standard deviation.
# Course note (Phillippa): prefer StandardScaler over this one-shot function.
scale(imputed_data, axis=0)

array([[-0.46, -1.18, -1.22, -1.22],
       [ 1.39, -0.09,  0.  ,  1.22],
       [-0.93,  1.27,  1.22,  0.  ]])

##### Show mean and standard deviation after the last transformation.

In [13]:
# describe() reports std with the sample formula (ddof=1), whereas scale()
# normalizes with the population formula (ddof=0) - hence 1.22 rather than 1
# for three rows.
pd.DataFrame(
    data=scale(imputed_data),
    columns=["A", "B", "C", "D"]
).describe()

Unnamed: 0,A,B,C,D
count,3.0,3.0,3.0,3.0
mean,0.0,0.0,0.0,0.0
std,1.22,1.22,1.22,1.22
min,-0.93,-1.18,-1.22,-1.22
25%,-0.69,-0.63,-0.61,-0.61
50%,-0.46,-0.09,0.0,0.0
75%,0.46,0.59,0.61,0.61
max,1.39,1.27,1.22,1.22


##### Create a DataFrame with 3 columns with labels 'y', 'x1' and 'x2', and 100 rows of random integers in [-20, 80].

In [24]:
# Dedicated, seeded generator so Restart & Run All reproduces the same frame
# without touching NumPy's global random state.
rng = np.random.RandomState(0)
# randint's upper bound is exclusive, so 81 is required to cover the closed
# interval [-20, 80] requested by the exercise.
df = pd.DataFrame(rng.randint(-20, 81, size=(100, 3)), columns=['y', 'x1', 'x2'])
# Only the last expression of a cell is rendered, so a df.head() placed before
# this line would never be shown.
df.tail()

Unnamed: 0,y,x1,x2
95,-17,59,66
96,-2,66,14
97,12,2,79
98,33,33,23
99,59,46,7


##### Use `train_test_split` from [sklearn.model_selection](http://scikit-learn.org/stable/modules/cross_validation.html) (named `sklearn.cross_validation` in older scikit-learn releases) to split the DataFrame into a train set with 80 rows and a test set with 20 rows

In [16]:
# train_test_split has lived in sklearn.model_selection since scikit-learn
# 0.18; sklearn.cross_validation was removed in 0.20. Keep the old path only
# as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
df = df.astype(float)
# Features are the two x columns, the target is y; .values hands plain arrays
# to scikit-learn.
X, y = df.loc[:, ['x1', 'x2']].values, df.loc[:, 'y'].values
# Returns four arrays: X_train / y_train (80 rows) and X_test / y_test
# (20 rows, because test_size=0.2 of 100 rows).
# random_state fixes the shuffle so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

##### Use [sklearn.preprocessing.StandardScaler](http://scikit-learn.org/stable/modules/preprocessing.html#standardization-or-mean-removal-and-variance-scaling) to standardize both train and test data

In [17]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean and standard deviation from the training rows
# only: the test rows stand in for unseen data and must never be fitted on.
stdsc = StandardScaler().fit(X_train)
# transform() subtracts the learned mean and divides by the learned std.
# Calling fit() and then transform() on the same data is what fit_transform()
# does in a single step.
X_train_std = stdsc.transform(X_train)
# The test rows are scaled with the statistics learned from the training rows.
X_test_std = stdsc.transform(X_test)

##### Show mean and standard deviations for all subsets

In [18]:
# Mean over every entry of the standardized training array (axis=None pools
# all rows and both features into one number).
X_train_std.mean(axis=None)

0.00

In [19]:
# Standard deviation over every entry of the standardized training array.
X_train_std.std(axis=None)

1.00

In [26]:
# Same summary for the test data. X_test_std was scaled with the mean and std
# learned from the training set, not with its own, so the result is close to
# 0 but not exactly 0.
X_test_std.mean(axis=None)

0.09

In [27]:
# Standard deviation over every entry of the test array after it was scaled
# with the training-set statistics.
X_test_std.std(axis=None)

1.06

##### What's the difference between StandardScaler and scale, and why does it matter? 

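##### Answer:
`scale` is a function that standardizes an array in one step using that array's own mean and standard deviation, and it keeps nothing afterwards. `StandardScaler` is an estimator: `fit` learns the mean and standard deviation from the training data, and `transform` reuses those same values on any other data. This matters because the test set must be scaled with the training statistics; scaling it with its own statistics would use information from data the model is supposed to have never seen and would put the two sets on different scales.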

##### Create a new column for the below DataFrame that translates the sizes into suitable integer values using `.map()`.

In [20]:
# Small example frame: one row per item, with string color/size/class labels
# and a numeric price.
df = pd.DataFrame(
    data=[
        ('green', 'M', 10.1, 'class1'),
        ('red', 'L', 13.5, 'class2'),
        ('blue', 'XL', 15.3, 'class1'),
    ],
    columns=['color', 'size', 'price', 'class']
)
df

Unnamed: 0,color,size,price,class
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [21]:
# Ordinal encoding for the size labels: a larger size gets a larger integer.
mapper = dict(M=1, L=2, XL=3)

In [22]:
# Look up every size label in the mapper and store the result as a new column.
df['int size'] = df.loc[:, 'size'].map(mapper)

In [23]:
# Show the first rows (up to five) to confirm the new column was added.
df.head(5)

Unnamed: 0,color,size,price,class,int size
0,green,M,10.1,class1,1
1,red,L,13.5,class2,2
2,blue,XL,15.3,class1,3
