# The following is Jake Vanderplas breaking down how to use groupby

In [1]:
import numpy as np
import pandas as pd

In [None]:
# 100 rows of two independent uniform samples, each scaled by 10.
data = pd.DataFrame(
    {
        'x': np.random.rand(100) * 10,
        'y': np.random.rand(100) * 10,
    }
)
data.head()

In [None]:
def get_outliers(x):
    """Return the entries of x lying more than one standard deviation from its mean.

    x must provide .mean() and .std() and support elementwise comparison
    and boolean-mask indexing (for example a pandas Series). Values exactly
    one standard deviation away are not returned (the comparisons are strict).
    """
    center = x.mean()
    spread = x.std()
    lower, upper = center - spread, center + spread
    return x[(x < lower) | (x > upper)]

# Cut x into 10 equal-width bins, then collect the y outliers of each bin.
result = (
    data
    .groupby(pd.cut(data.x, 10))['y']
    .apply(get_outliers)
)

In [None]:
result.head()

In [None]:
# Drop the bin level so only the original row labels remain, then select
# those rows by label. The values come from data's index, so .loc (label
# based) is the matching selector; .iloc would treat them as positions and
# only agrees while data keeps its default 0..n-1 index.
index = result.index.droplevel(0)
data.loc[index].sort_index()

## Now me Playing with the code for learning purposes.

### - Dictionaries and dataframes

In [None]:
# A plain dictionary: string keys mapped to integer values.
new_dict_0 = dict(x=1, y=2, z=3)
print(new_dict_0)

In [None]:
# A key can also map to a whole list of values; each key gets its own list.
new_dict_1 = {key: [1, 2, 3, 4, 5] for key in ("x", "y", "z")}
print(new_dict_1)
type(new_dict_1)

In [None]:
# A dict of equal-length lists converts directly into a DataFrame.
my_dataframe_0 = pd.DataFrame(data=new_dict_1)
print(my_dataframe_0)
print(type(my_dataframe_0))
my_dataframe_0.head()

### - Random module 

In [None]:
# np.random.rand returns a NumPy ndarray of floats drawn uniformly from [0, 1);
# its arguments set the dimensions of the array.
object_1 = 10 * np.random.rand(5)  # scaled by 10 just to get values above 1
print(object_1)
type(object_1)

# Try other shapes: np.random.rand(2, 5), np.random.rand(5, 2) or np.random.rand(2, 5, 5).

### - Dataframes from random modules

In [None]:
# fake_data_0 and fake_data_2 are DataFrames; fake_data_1 is left as a plain dict.
fake_data_0 = pd.DataFrame({
    "x": 10 * np.random.rand(5),
    "y": 10 * np.random.rand(5),
})
fake_data_1 = dict(z=10 * np.random.rand(20))
fake_data_2 = pd.DataFrame({"q": 10 * np.random.rand(5)})

In [None]:
fake_data_0.head()

In [None]:
# Indexing is zero-based, so position 3 holds the fourth element.
list_0 = list(range(1, 7))
list_0[3]

In [None]:
#trying to understand the following function:
def get_outliers_1(x):
    """Keep only the entries of x more than one standard deviation from the mean.

    This works for any object that offers .mean() and .std() methods and
    supports elementwise comparison and boolean-mask indexing, such as a
    pandas Series or DataFrame. Plain lists and dicts have no such methods.
    """
    center, spread = x.mean(), x.std()
    too_low = x < center - spread
    too_high = x > center + spread
    return x[too_low | too_high]


In [None]:
#testing a method of a dataframe
fake_data_0.mean()  #note that fake_data_1.mean() and list_0.mean() won't work: fake_data_1 is a dict and list_0 is a list, and neither type has a .mean() method

# Jake VanderPlas tutorial on groupby 
### from http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/03.08-Aggregation-and-Grouping.ipynb

In [None]:
#let's import some data and play with it
import seaborn as sns
planets = sns.load_dataset('planets')  # seaborn's example dataset named 'planets'
type(planets)  # NOTE(review): presumably not displayed, since a notebook cell only echoes its last expression
planets.head()

In [None]:
# The mean of every numeric column. numeric_only=True skips the text column
# ('method'), which pandas >= 2.0 otherwise refuses to average (TypeError).
planets.mean(numeric_only=True)

In [None]:
planets.dropna().describe()  #dropping NaN values and then doing basic statistics. 

In [None]:
# Small toy frame: three keys, each appearing twice, with data values 0..5.
df = pd.DataFrame({
    'key': list('ABCABC'),
    'data': range(6),
})

In [None]:
df.head()

In [None]:
df.groupby("key").count() #df.groupby("key") creates a GroupBy object on which aggregation methods (count, min, mean, std, ...) can be called

### Column Indexing 

In [None]:
planets.groupby("method").count() #count() tallies the non-null values of each column within each group, so columns with missing values can show smaller counts

In [None]:
planets.groupby("method").mean() #this groups by method name, then takes the mean of each column. super nice :) 

In [None]:
#If you only care about a specific column such as "orbital_period"
planets.groupby("method")["orbital_period"].mean()

### Iterating over groups

In [None]:
# Iterating over a grouped DataFrame yields two components per group:
# the name it was grouped by and the associated sub-DataFrame.
dictionary = {
    method_name: method_frame
    for method_name, method_frame in planets.groupby("method")
}

dictionary["Astrometry"]  # just one of the groups, with no statistics applied to it

In [None]:
type(dictionary["Astrometry"])

In [None]:
#continuing with the example
for method, group in planets.groupby("method"): #each iteration yields a group name and the DataFrame that goes with it
    print("{0:30} shape ={1}".format(method, group.shape)) #shape is an attribute (a tuple), not a method, so it takes no parentheses

In [None]:
planets.groupby('method')['year'].describe().unstack()

### Understanding aggregate() filter() transform() apply()

In [20]:
rng = np.random.RandomState((2,3))  # NOTE: the argument is the generator's seed (an int or array-like), not an array shape

In [24]:
rnd.shape()

NameError: name 'rnd' is not defined

In [21]:
type(rng)

mtrand.RandomState

### Taking an aside to understand str() vs repr()
str() is meant to return representations of values which are fairly human-readable.
repr() is meant to generate a representation which can be read by the interpreter (or which forces a SyntaxError if there is no equivalent syntax).

In [None]:
# repr() keeps the quotation marks around a string and shows escapes such as
# \n literally; print(s) and print(str(s)) show the text itself.
s = "Hello world\n2\n3\n4"
print(s, str(s), repr(s), sep="\n")


In [None]:
d = 1/7
str(d) #if print this, there will be no quotes 

In [None]:
repr(d)#if print this, there will be no quotes 

In [None]:
# Each label names the expression that actually produced the value next to it.
print("The value of repr(d) is " + repr(d) + " The value of str of d is " + str(d) + " And value of str(s) is not printed" +
     " And repr(s) is " + repr(s) + " And repr(16*7) is " + repr(16*7) + " And str(16*7) is " + str(16*7))

In [None]:
# For a list, print(), str() and repr() all produce the same text.
some_list = [
    [1, 2, 3],
    [3, 4, 5],
    [6, 7, 8],
]
print(some_list, str(some_list), repr(some_list), sep="\n")

In [None]:
# Same comparison for a tuple of tuples.
some_tuple = (
    (1, 2, 3),
    (3, 4, 5),
    (5, 6, 7),
)
print(some_tuple, str(some_tuple), repr(some_tuple), sep="\n")

In [None]:
# Same comparison for a dict whose values are lists of different lengths.
some_dict = {
    "x": [1, 2, 3, 4],
    "y": [4, 5, 6, 7],
    "z": [8, 6, 1, 1, 1, 1, 1],
}
print(some_dict, str(some_dict), repr(some_dict), sep="\n")

### Taking one more aside to play with .format()

In [None]:
"{:^10}".format("test")

In [None]:
"{:10d}".format(42)

In [None]:
"{:.3f}".format(4232.1231412) #.3 means 3 places after decimal, how to limit total number of characters?

In [None]:
'{:10}'.format('test')

### Following the Min, Max, and everything in between tutorial by Jake VDP - focus on random modules
    http://nbviewer.jupyter.org/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/02.04-Computation-on-arrays-aggregates.ipynb