### Introduction to Pandas

Through pandas, you get acquainted with your data by cleaning, transforming, and analyzing it.

In [14]:
import pandas as pd

The two primary components of pandas are the Series and the DataFrame.

A Series is essentially a single column, and a DataFrame is a two-dimensional table made up of a collection of Series that share the same index.

#### Creating Dataframes

In [None]:
#raw inventory as a mapping of column name -> list of values (one entry per store)
data = dict(
    store_id=[1, 2, 3, 4],
    apples=[3, 2, 0, 1],
    oranges=[0, 3, 7, 2],
)

In [None]:
#each (key, value) item in data becomes one column of the resulting DataFrame
inventory = pd.DataFrame(data=data)
inventory

In [None]:
#rebuild the frame, labelling each row with a store name instead of the default integer index
store_names = ['Trader Joes', 'Star Market', 'Whole Foods', 'Local Deli']
inventory = pd.DataFrame(data, index=store_names)
inventory

In [None]:
#remove the store_id column from the inventory df
#rebinding to the result of .drop() (instead of `del`) avoids mutating the frame in place,
#and errors='ignore' lets this cell be re-run after the column is already gone
inventory = inventory.drop(columns=['store_id'], errors='ignore')

In [None]:
#display the frame again (this cell follows the one that removes store_id)
inventory

In [None]:
#look up one store's row by its index label
store = 'Trader Joes'
inventory.loc[store]

In [None]:
#look up several stores at once by passing a list of index labels
stores = ['Trader Joes', 'Star Market']
inventory.loc[stores]

#### Dataframes from CSV files

In [20]:
#location of the wages dataset
#NOTE(review): this absolute path is specific to one machine; adjust it before running elsewhere
WAGES_CSV_PATH = 'C:/Users/Ergo/Desktop/Fall2018Statistics/wages.csv'
df = pd.read_csv(WAGES_CSV_PATH)

In [16]:
#peek at the first 5 rows of the dataframe (5 is the default for head)
df.head(n=5)

Unnamed: 0,earn,height,sex,race,ed,age
0,159142,73.89,male,white,16,49
1,192794,66.23,female,white,16,62
2,97422,63.77,female,white,16,33
3,160956,63.22,female,asian,16,95
4,164178,63.08,female,white,17,43


In [22]:
#peek at the first 10 rows
df.head(n=10)

Unnamed: 0,earn,height,sex,race,ed,age
0,159142,73.89,male,white,16,49
1,192794,66.23,female,white,16,62
2,97422,63.77,female,white,16,33
3,160956,63.22,female,asian,16,95
4,164178,63.08,female,white,17,43
5,30626,64.53,female,white,15,30
6,94208,61.54,female,white,12,53
7,101920,73.29,male,white,17,50
8,6426,72.24,male,hispanic,15,25
9,85994,72.4,male,white,12,30


In [17]:
#peek at the last 5 rows of the dataframe (5 is the default for tail)
df.tail(n=5)

Unnamed: 0,earn,height,sex,race,ed,age
1374,60346,71.68,male,white,12,33
1375,49708,61.31,female,white,18,86
1376,27422,63.64,female,white,12,37
1377,190852,71.65,male,white,12,54
1378,19150,68.22,male,white,12,31


In [23]:
#peek at the last 10 rows
df.tail(n=10)

Unnamed: 0,earn,height,sex,race,ed,age
1369,9512,72.94,male,hispanic,15,24
1370,351802,65.9,female,asian,18,52
1371,174948,68.82,male,white,18,75
1372,184412,69.62,male,white,18,57
1373,33812,70.08,female,white,16,40
1374,60346,71.68,male,white,12,33
1375,49708,61.31,female,white,18,86
1376,27422,63.64,female,white,12,37
1377,190852,71.65,male,white,12,54
1378,19150,68.22,male,white,12,31


In [21]:
#write the inventory dataframe out as a CSV file in the current working directory
inventory.to_csv(path_or_buf='inventory.csv')

#### Dataframe Operations

In [18]:
#number of rows in the dataframe, i.e. the length of its index
len(df.index)

1379

In [25]:
#get more information about your dataset
#.info() prints a summary of the dataframe: the index range and number of rows,
#each column with its non-null count and dtype, a tally of dtypes,
#and how much memory the DataFrame is using.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379 entries, 0 to 1378
Data columns (total 6 columns):
earn      1379 non-null int64
height    1379 non-null float64
sex       1379 non-null object
race      1379 non-null object
ed        1379 non-null int64
age       1379 non-null int64
dtypes: float64(1), int64(3), object(2)
memory usage: 64.7+ KB


In [26]:
#get how many rows and columns your dataset has
df.shape

#it outputs a tuple: (number of rows, number of columns)

(1379, 6)

In [27]:
#stack df on top of itself so every row appears twice; df itself is left unchanged
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so use pd.concat,
#which (like append) keeps the original index labels of both pieces
df_temp = pd.concat([df, df])

In [29]:
#row count of the doubled-up frame
len(df_temp.index)

2758

In [30]:
#remove duplicate rows, keeping the first occurrence of each
#rebinding the result (instead of inplace=True) keeps the step explicit and chainable
df_temp = df_temp.drop_duplicates()

In [31]:
#check the shape after removing duplicates (compare with df.shape)
df_temp.shape

(1379, 6)

In [32]:
#get all column labels of the df (returned as a pandas Index)
df.columns

Index(['earn', 'height', 'sex', 'race', 'ed', 'age'], dtype='object')

In [34]:
#rename some of the columns to more descriptive names
#rebinding df to the renamed frame (instead of inplace=True) avoids mutating
#the original object in place; columns not listed in the mapping are kept as-is
df = df.rename(columns={
    'earn': 'salary',
    'ed': 'education',
})

#confirm the new column labels
df.columns

Index(['salary', 'height', 'sex', 'race', 'education', 'age'], dtype='object')