# Pandas Intro Commands

# Pandas dataframes

In [3]:
import pandas as pd
import numpy as np

In [4]:
#create a dataframe
df1=pd.DataFrame(columns=['age','education_level', 'birth_city', 'height'])
df1

Unnamed: 0,age,education_level,birth_city,height


In [5]:
#add values to dataframe via dictionary keys
values={'age':12, 'education_level': 'middle school', 'birth_city':'Toronto', 'height':140}

In [6]:
#append newly created values to df
df1=df1.append(values, ignore_index=True)
print(df1)

  age education_level birth_city height
0  12   middle school    Toronto    140


In [22]:
#populate a dataframe's column with random numbers
np.random.seed(20)
df1['age']=np.random.randint(15,45, size=len(df1))
df1['age']

0    18
1    41
2    30
3    43
Name: age, dtype: int64

In [23]:
#add x numbers of rows to dataframe by creating a list of series
#if you want to generate random numbers for age col and still pass a list
np.random.seed(20)
listOfSeries = [pd.Series([np.random.randint(15,45), 'college', 'New York', 160], index=df1.columns ) ,
                pd.Series([np.random.randint(15,45), 'grad school', 'Tokyo', 183], index=df1.columns ) ,
                pd.Series([np.random.randint(15,45), 'grad school', 'Las Vegas', 179], index=df1.columns ) ]

In [24]:
#pass the list of series to the dataframe to add rows
df1=df1.append(listOfSeries, ignore_index=True)

In [25]:
df1

Unnamed: 0,age,education_level,birth_city,height
0,18,middle school,Toronto,140
1,41,college,New York,160
2,30,grad school,Tokyo,183
3,43,grad school,Las Vegas,179
4,18,college,New York,160
5,41,grad school,Tokyo,183
6,30,grad school,Las Vegas,179


# Working with CSV

In [11]:
df_wine = pd.read_csv('wine.csv')
df_wine.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


In [12]:
#order by a specific attribute - in this case order by points column
df_wine.sort_values('points', ascending = 'True').head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
92914,92914,Spain,"Smells like canned fruit, and tastes like gree...",Clos Torribas Blanc de Blancs,80,10.0,Catalonia,Penedès,,White Blend,Pinord
96766,96766,Spain,There's something not right about the way this...,Gran Reserva Brut Nature,80,27.0,Catalonia,Cava,,Sparkling Blend,Oriol Rossell
96767,96767,Spain,This is mucky and sweet. Its best foot is not ...,,80,8.0,Northern Spain,Cariñena,,Garnacha,Castillo de Monséran
96768,96768,Spain,"Eight grapes comprise this weird, peanutty, al...",Las Ocho,80,25.0,Levante,Utiel-Requena,,Red Blend,Chozas Carrascal
96769,96769,US,"Port-like, with raisiny, baked cherry and cara...",Grand Reserve,80,40.0,California,California,California Other,Petit Verdot,French Hill


In [13]:
#group by a specific column and perform a calculation
#for example, group by country and perform average wine points
country_group=df_wine.groupby('country')

for i,j in country_group: #i- country, j-points
    print(i)
    print(j.points.mean())
    
#store this information in a dataframe itself
#store the calculation above as a function
def calc_wine_point_avg(country_group):
    res=[]
    for i,j in country_group:
        res.append(i,j.points.mean())
    return res

country_group=df_wine.groupby('country')
calc_wine_point_avg(country_group)

df_point_average=pd.DataFrame(res)

Albania
88.0
Argentina
85.9960930562955
Australia
87.89247528747227
Austria
89.27674190382729
Bosnia and Herzegovina
84.75
Brazil
83.24
Bulgaria
85.46753246753246
Canada
88.23979591836735
Chile
86.29676753782668
China
82.0
Croatia
86.28089887640449
Cyprus
85.87096774193549
Czech Republic
85.83333333333333
Egypt
83.66666666666667
England
92.88888888888889
France
88.92586975068727
Georgia
85.51162790697674
Germany
88.62642740619903
Greece
86.11764705882354
Hungary
87.32900432900433
India
87.625
Israel
87.17619047619047
Italy
88.41366385552432
Japan
85.0
Lebanon
85.70270270270271
Lithuania
84.25
Luxembourg
87.0
Macedonia
84.8125
Mexico
84.76190476190476
Moldova
84.71830985915493
Montenegro
82.0
Morocco
88.16666666666667
New Zealand
87.55421686746988
Portugal
88.05768508079669
Romania
84.92086330935251
Serbia
87.71428571428571
Slovakia
83.66666666666667
Slovenia
88.23404255319149
South Africa
87.22542072630647
South Korea
81.5
Spain
86.64658925979681
Switzerland
87.25
Tunisia
86.0
Turkey
8

TypeError: append() takes exactly one argument (2 given)

In [14]:
#store this information in a dataframe itself
#store the calculation above as a function, initialize a new df, do calculation and store the info into the new df
def calc_wine_point_avg(country_group):
    res_df =[]
    for i,j in country_group:
        res_df.append((i,j.points.mean()))
    new_df = pd.DataFrame(res_df)
    print(new_df)

country_group=df_wine.groupby('country') #operation you're doing with the original df
calc_wine_point_avg(country_group)


                         0          1
0                  Albania  88.000000
1                Argentina  85.996093
2                Australia  87.892475
3                  Austria  89.276742
4   Bosnia and Herzegovina  84.750000
5                   Brazil  83.240000
6                 Bulgaria  85.467532
7                   Canada  88.239796
8                    Chile  86.296768
9                    China  82.000000
10                 Croatia  86.280899
11                  Cyprus  85.870968
12          Czech Republic  85.833333
13                   Egypt  83.666667
14                 England  92.888889
15                  France  88.925870
16                 Georgia  85.511628
17                 Germany  88.626427
18                  Greece  86.117647
19                 Hungary  87.329004
20                   India  87.625000
21                  Israel  87.176190
22                   Italy  88.413664
23                   Japan  85.000000
24                 Lebanon  85.702703
25          

In [15]:
#if you want to do the group by within the function itself
def group_by_and_calc_avg(df):
    res_df=[]
    df=df.groupby('country')
    for i, j in df:
        res_df.append((i,j.points.mean()))
    new_df2=pd.DataFrame(res_df)
    print(new_df2)

group_by_and_calc_avg(df_wine)

                         0          1
0                  Albania  88.000000
1                Argentina  85.996093
2                Australia  87.892475
3                  Austria  89.276742
4   Bosnia and Herzegovina  84.750000
5                   Brazil  83.240000
6                 Bulgaria  85.467532
7                   Canada  88.239796
8                    Chile  86.296768
9                    China  82.000000
10                 Croatia  86.280899
11                  Cyprus  85.870968
12          Czech Republic  85.833333
13                   Egypt  83.666667
14                 England  92.888889
15                  France  88.925870
16                 Georgia  85.511628
17                 Germany  88.626427
18                  Greece  86.117647
19                 Hungary  87.329004
20                   India  87.625000
21                  Israel  87.176190
22                   Italy  88.413664
23                   Japan  85.000000
24                 Lebanon  85.702703
25          