# The following is Jake Vanderplas breaking down how to use groupby

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Two columns, each holding 100 uniform random draws scaled from [0, 1) up to [0, 10).
data = pd.DataFrame({
    'x': np.random.rand(100) * 10,
    'y': np.random.rand(100) * 10,
})
data.head()

Unnamed: 0,x,y
0,9.332956,8.647236
1,8.308141,3.23911
2,0.042359,3.17688
3,3.684723,7.26613
4,7.577109,9.306614


In [None]:
def get_outliers(x):
    """Return the entries of ``x`` lying more than one standard deviation from its mean.

    ``x`` must provide ``mean()`` and ``std()`` and support element-wise
    comparison and boolean-mask indexing (e.g. a pandas Series).
    """
    center = x.mean()
    spread = x.std()
    lower_bound = center - spread
    upper_bound = center + spread
    # Strict inequalities: values exactly one deviation away are not outliers.
    too_low = x < lower_bound
    too_high = x > upper_bound
    return x[too_low | too_high]

# Bin the rows into 10 equal-width intervals of x, then gather the y outliers found within each bin.
result = data.groupby(pd.cut(data.x, 10))['y'].apply(get_outliers)

In [None]:
result.head()

In [None]:
# Strip the outermost index level of `result` (presumably the bin level added by
# groupby/apply) to keep just the row labels of the outliers.
# `data` has the default 0..n-1 index, so those labels double as positions for iloc.
index = result.index.droplevel(0)
data.iloc[index].sort_index()

## Now me playing with the code for learning purposes.

In [25]:
result = pd.cut(data.x, 10)

In [27]:
result

0        (8.921, 9.908]
1        (7.935, 8.921]
2      (0.0325, 1.0289]
3      (3.00207, 3.989]
4        (6.948, 7.935]
5        (4.975, 5.962]
6        (5.962, 6.948]
7        (7.935, 8.921]
8        (4.975, 5.962]
9      (3.00207, 3.989]
10       (6.948, 7.935]
11       (6.948, 7.935]
12       (8.921, 9.908]
13       (4.975, 5.962]
14     (1.0289, 2.0155]
15       (7.935, 8.921]
16       (5.962, 6.948]
17    (2.0155, 3.00207]
18       (8.921, 9.908]
19       (6.948, 7.935]
20       (5.962, 6.948]
21       (4.975, 5.962]
22     (0.0325, 1.0289]
23       (8.921, 9.908]
24       (3.989, 4.975]
25       (5.962, 6.948]
26       (5.962, 6.948]
27       (8.921, 9.908]
28       (3.989, 4.975]
29     (1.0289, 2.0155]
            ...        
70     (0.0325, 1.0289]
71       (4.975, 5.962]
72       (5.962, 6.948]
73       (5.962, 6.948]
74     (3.00207, 3.989]
75       (3.989, 4.975]
76       (7.935, 8.921]
77       (4.975, 5.962]
78     (1.0289, 2.0155]
79     (0.0325, 1.0289]
80       (5.962,

In [24]:
pd.cut?

### - Dictionaries and dataframes

In [None]:
# A minimal dictionary: each string key maps to a single int value.
new_dict_0 = dict(x=1, y=2, z=3)
print(new_dict_0)

In [None]:
# Same idea, but now every key maps to its own list of values.
new_dict_1 = {key: [1, 2, 3, 4, 5] for key in ("x", "y", "z")}
print(new_dict_1)
type(new_dict_1)

In [None]:
#Can we convert a dict into a dataframe? 
my_dataframe_0 = pd.DataFrame(new_dict_1)
print(my_dataframe_0)
print(type(my_dataframe_0))
my_dataframe_0.head()

### - Random module 

In [None]:
#the numpy random package, this is numpy object called nd array. outputs floats between 0 and 1. May have dimensions. 
object_1 = np.random.rand(5)*10 #multiply by 10 just to get bigger values (above 1). 
print(object_1)
type(object_1)

#play with input of np.random.rand(x,y,z)... try (2,5) or (5,2) or (2,5,5)

### - Dataframes from random modules

In [None]:
# Random values in [0, 10): two small DataFrames plus, for contrast, a plain dict holding an array.
fake_data_0 = pd.DataFrame({
    "x": 10 * np.random.rand(5),
    "y": 10 * np.random.rand(5),
})
fake_data_1 = dict(z=10 * np.random.rand(20))
fake_data_2 = pd.DataFrame(dict(q=10 * np.random.rand(5)))

In [None]:
fake_data_0.head()

In [None]:
list_0 = [1,2,3,4,5,6]
list_0[3]

In [None]:
#trying to understand the following function:
def get_outliers_1(x):
    """Select the values of ``x`` that sit more than one standard deviation from the mean.

    Works on any object offering ``mean()`` and ``std()`` methods together with
    element-wise comparison and boolean-mask indexing (for example a pandas
    Series or DataFrame).
    """
    mu, std = x.mean(), x.std()
    is_outlier = (x < mu - std) | (x > mu + std)
    return x[is_outlier]


In [None]:
# Testing the aggregation methods of a DataFrame.
fake_data_0.mean()  # fake_data_1.mean() and list_0.mean() would fail: fake_data_1 is a plain dict and list_0 a plain list, and neither type defines .mean()

# Jake VanderPlas tutorial on groupby 
### from http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.08-Aggregation-and-Grouping.ipynb

### simple aggregation in pandas

#understanding np.random.RandomState 

`RandomState`, besides being
NumPy-aware, has the advantage that it provides a much larger number
of probability distributions to choose from.

In [None]:
# Understanding RandomState
rng = np.random.RandomState(43) # creates a random number generator whose sequence is fixed by the "seed" value (43)
rng.rand(5) # 5 floats drawn from [0, 1), returned as a NumPy array rather than a list

In [None]:
ser = pd.Series(rng.rand(5))# Series is One-dimensional ndarray with axis labels
ser

In [None]:
#lets import some data and play with it
import seaborn as sns
planets = sns.load_dataset('planets')
type(planets)
planets.head()

In [None]:
# The mean of every numeric column. `planets` also holds the text column "method",
# which recent pandas versions refuse to average unless numeric_only=True is given.
planets.mean(numeric_only=True)

In [None]:
planets.dropna().describe()  #dropping NaN values and then doing basic statistics. 

In [None]:
# Small demo frame: the keys A, B, C repeated twice, paired with the integers 0 through 5.
df = pd.DataFrame({
    'key': list('ABC') * 2,
    'data': range(6),
})

In [None]:
df.head()

In [None]:
df.groupby("key").count() #the df.groupby("key") creates an object to which apply cool attributes (min, mean, stdev) 

### Column Indexing 

In [None]:
planets.groupby("method").count() # count() tallies the non-null entries per column within each method, so a column with missing values can show a smaller count than the group's row total

In [None]:
planets.groupby("method").mean() #this groups by method name, then takes the mean of each column. super nice :) 

In [None]:
#If you only care about a specific column such as "orbital_period"
planets.groupby("method")["orbital_period"].mean()

### Iterating over groups

In [None]:
# Iterating over a grouped DataFrame yields (group name, sub-DataFrame) pairs,
# so it converts directly into a name -> DataFrame mapping. A comprehension keeps
# the loop variables out of the notebook's global namespace.
dictionary = {method: group for method, group in planets.groupby("method")}

dictionary["Astrometry"] # this is just one of the groups, without any stats being done on it.

In [None]:
type(dictionary["Astrometry"])

In [None]:
# Continuing with the example.
for method, group in planets.groupby("method"): # each iteration yields the group's key and the sub-DataFrame of rows sharing that key
    print("{0:30} shape ={1}".format(method, group.shape)) # no parentheses on shape: it is an attribute holding a (rows, columns) tuple, not a method to be called

In [None]:
planets.groupby('method')['year'].describe().unstack()

### Aggregate, Filter, Transform, Apply

### An aside into "Min max and everything in between" - aggregation function
    http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.04-Computation-on-arrays-aggregates.ipynb
    

In [None]:
#the numpy random module (python also has a random module)
s = np.random.rand(2,4) #this is a float from 0 to 1 , 2 arrays consisting of 4 values
type(s)
s

In [None]:
p = np.random.randint(0,100,3) #creates a single array between 0, 100, and length of 3
type(p)
p

In [None]:
x=np.random.random(100) #outputs an array of 100 floats all between [0.0, 1.0)
x

In [None]:
# Show both results side by side: the array's own NumPy sum() method (computed much
# more quickly) and Python's built-in sum(). A cell only displays its last expression,
# so the two values are returned together as a tuple instead of discarding the first.
x.sum(), sum(x)

In [None]:
# Time the built-in sum() against the array's own sum() method on a million random floats.
long_array = np.random.random(1000000)
%timeit sum(long_array)
%timeit long_array.sum()  # be careful which sum you mean: np.sum() is aware of array dimensions

In [None]:
#similarly, there is min and max
%timeit min(long_array)
%timeit long_array.min()

In [51]:
#use a tuple to make multi-demensional array with random
array = np.random.random((2,4))
print(array)

[[ 0.85093253  0.26643857  0.88758823  0.69183359]
 [ 0.49824726  0.16556974  0.7067266   0.44926   ]]


In [52]:
array1 = np.random.rand(2,4) # i see many similarities between random.random and random.rand 
print(array1)

[[ 0.01856441  0.64030432  0.1991209   0.68310912]
 [ 0.834834    0.33075287  0.45807626  0.90459984]]


In [53]:
#sum takes the sum of the entire array 
array1.sum()

4.069361729650776

In [54]:
# Once and for all: axis=0 collapses the rows, leaving one minimum per column;
# axis=1 collapses the columns, leaving one minimum per row.
array1.min(axis=0)

array([ 0.01856441,  0.33075287,  0.1991209 ,  0.68310912])

In [55]:
array1.min(axis=1)

array([ 0.01856441,  0.33075287])

The axis keyword specifies the dimension of the array that will be collapsed, rather than the dimension that will be returned. So specifying axis=0 means that the first axis will be collapsed: for two-dimensional arrays, this means that values within each column will be aggregated.

### Taking an aside to understand str() vs repr()
str() is meant to return a representation of a value that is fairly human-readable.
repr() is meant to generate a representation that can be read by the interpreter (or that raises a syntax error if no equivalent syntax exists).

In [None]:
#repr leaves quatations around a string 
s = "Hello world\n2\n3\n4"
print(s)
print(str(s))
print(repr(s))


In [None]:
d = 1/7
str(d) #if print this, there will be no quotes 

In [None]:
repr(d)#if print this, there will be no quotes 

In [None]:
# Contrast repr() and str() on a float (d), a string (s) and an int, labelling each
# value with the expression that actually produced it.
print("The value of repr(d) is " + repr(d) + " The value of str of d is " + str(d) + " And value of str(s) is not printed" +
     " And repr(s) is " + repr(s) + " And repr(16*7) is " + repr(16*7) + " And str(16*7) is " + str(16*7))

In [None]:
some_list = [[1,2,3],[3,4,5],[6,7,8]]
print(some_list)
print(str(some_list))
print(repr(some_list))

In [None]:
some_tuple = ((1,2,3),(3,4,5),(5,6,7))
print(some_tuple)
print(str(some_tuple))
print(repr(some_tuple))

In [None]:
some_dict = {"x":[1,2,3,4],
            "y":[4,5,6,7],
            "z":[8,6,1,1,1,1,1,]}
print(some_dict)
print(str(some_dict))
print(repr(some_dict))

### Taking one more aside to play with .format()

In [1]:
"{:^10}".format("test")

'   test   '

In [2]:
"{:10d}".format(42)

'        42'

In [3]:
"{:.3f}".format(4232.1231412) # .3f keeps 3 digits after the decimal point; a width placed before it (e.g. "{:10.3f}") sets a minimum field width, but a number is never truncated to fit

'4232.123'

In [4]:
'{:10}'.format('test')

'test      '

### Binning Data Example by Chris Albon
http://chrisalbon.com/python/pandas_binning_data.html

In [28]:
# Twelve soldiers: three regiments of four, each regiment split evenly into a 1st and a 2nd company.
raw_data = {
    'regiment': ['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    'company': ['1st', '1st', '2nd', '2nd'] * 3,
    'name': ['Miller', 'Jacobson', 'Ali', 'Milner',
             'Cooze', 'Jacon', 'Ryaner', 'Sone',
             'Sloan', 'Piger', 'Riani', 'Ali'],
    'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70],
}
# Pin the column order explicitly instead of relying on dict ordering.
df = pd.DataFrame(
    raw_data,
    columns=['regiment', 'company', 'name', 'preTestScore', 'postTestScore'],
)
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [29]:
bins = [0, 25, 50, 75, 100]
group_names = ['Low', 'Okay', 'Good', 'Great']

In [32]:
categories = pd.cut(df['postTestScore'], bins)#, labels=group_names)

In [33]:
categories

0       (0, 25]
1     (75, 100]
2      (50, 75]
3      (50, 75]
4      (50, 75]
5       (0, 25]
6     (75, 100]
7      (50, 75]
8      (50, 75]
9      (50, 75]
10     (50, 75]
11     (50, 75]
Name: postTestScore, dtype: category
Categories (4, object): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [34]:
pd.groupby?

### Trying to answer the question of whether I can group a dataframe using a different object (i.e. an array of the same length)
Answer: yes. As long as the array has the same length as the dataframe, each row is grouped by the array value at the same position.

In [38]:
dict_of_numbers = {"heigh":[160,165,173,158],
                  "weifht":[76,80,72,52]}
age_array = np.array([20,20,35,35])

In [50]:
age_array

array([20, 20, 35, 35])

In [40]:
df_dict_of_numbers = pd.DataFrame(dict_of_numbers)

In [48]:
groupws_df = df_dict_of_numbers.groupby(age_array).mean()

In [49]:
groupws_df

Unnamed: 0,heigh,weifht
20,162.5,78
35,165.5,62
