In [2]:
import numpy as np
import pandas as pd
import os
import uuid

In [3]:
my_index = ['USA', 'Canada', 'Mexico']
my_data = [1776, 1867, 1821]

In [4]:
my_series = pd.Series(data=my_data, index=my_index)
my_series

USA       1776
Canada    1867
Mexico    1821
dtype: int64

In [5]:
#labeled index
my_series['USA']

1776

In [6]:
#numeric index
my_series[0]

1776

In [7]:
ages = {'Sam': 5, 'Frank': 10, 'Spike': 7 }
pd.Series(ages)

Sam       5
Frank    10
Spike     7
dtype: int64

### Imaginary sales data 

In [8]:
q1 = {'Japan': 80, 'China': 450, 'India': 200, 'USA': 250}
q2 = {'Brazil': 100, 'China': 500, 'India': 210, 'USA': 260}

In [9]:
sales_q1 = pd.Series(q1)
sales_q2 = pd.Series(q2)

In [10]:
#to get the keys or index
sales_q1.keys()

Index(['Japan', 'China', 'India', 'USA'], dtype='object')

In [11]:
# to combine series
total_sales = sales_q1.add(sales_q2, fill_value=0)
total_sales

Brazil    100.0
China     950.0
India     410.0
Japan      80.0
USA       510.0
dtype: float64

### DATAFRAMES

In [12]:
np.random.seed(101)
my_data = np.random.randint(0,101,(4,3))

In [13]:
my_index = ['CA', 'NY', 'AZ', 'TX']
my_columns = ['Jan', 'Feb', 'Mar']

In [14]:
df = pd.DataFrame(data=my_data, index=my_index, columns=my_columns)
df

Unnamed: 0,Jan,Feb,Mar
CA,95,11,81
NY,70,63,87
AZ,75,9,77
TX,40,4,63


In [15]:
#get current file location
os.getcwd()

'/Users/chestergarettcalingacion/Documents/python_projects/ml_pathway'

In [16]:
# get all files of current directory
os.listdir()

['numpy_101.ipynb',
 'pandas_101.ipynb',
 '.ipynb_checkpoints',
 '.git',
 'sample_files']

In [17]:
df = pd.read_csv(r'sample_files/tips.csv')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [18]:
#get columns
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [19]:
#get the indexes
df.index

RangeIndex(start=0, stop=244, step=1)

In [20]:
#calculate basic statistical calculation
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


In [21]:
#extract column
df['total_bill'].head()

0    16.99
1    10.34
2    21.01
3    23.68
4    24.59
Name: total_bill, dtype: float64

In [22]:
my_cols = ['total_bill', 'tip']
df[my_cols]

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.50
3,23.68,3.31
4,24.59,3.61
...,...,...
239,29.03,5.92
240,27.18,2.00
241,22.67,2.00
242,17.82,1.75


In [23]:
df['tip_percent'] = 100* (df['tip'] / df['total_bill'])
df['price_per_person'] = np.round(df['total_bill']/df['size'],2)

In [24]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percent,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673,8.49
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159,3.45
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734,7.0
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765,6.15


In [25]:
#removing columns
df.drop('tip_percent',axis=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45
2,21.01,3.50,Male,No,Sun,Dinner,3,7.00
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91


In [26]:
##adding unique identifier
df['UUID'] = 0
for i in range(0,len(df)-1):
    df['UUID'][i] = str(uuid.uuid4()).split('-')[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['UUID'][i] = str(uuid.uuid4()).split('-')[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [27]:
### setting an index
df_index =  df.set_index('UUID')
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percent,price_per_person,UUID
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673,8.49,6f1175e7
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159,3.45,fc527db5


In [30]:
#extract multiple rows based on index
df_index.loc[['6f1175e7', 'fc527db5']]

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,tip_percent,price_per_person
UUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6f1175e7,16.99,1.01,Female,No,Sun,Dinner,2,5.944673,8.49
fc527db5,10.34,1.66,Male,No,Sun,Dinner,3,16.054159,3.45


In [None]:
#dropping the rows
df_index.drop(['ecd0a807', 'c712a688'], axis=0)

In [None]:
one_row = df.iloc[0]

In [None]:
##appending rows
df = df.append(one_row)

### CONDITIONAL FILTERING

In [32]:
df[df['total_bill'] > 40]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percent,price_per_person,UUID
59,48.27,6.73,Male,No,Sat,Dinner,4,13.942407,12.07,67254828
95,40.17,4.73,Male,Yes,Fri,Dinner,4,11.774956,10.04,0c227823
102,44.3,2.5,Female,Yes,Sat,Dinner,3,5.643341,14.77,4c21ca57
142,41.19,5.0,Male,No,Thur,Lunch,5,12.138869,8.24,1b92244a
156,48.17,5.0,Male,No,Sun,Dinner,6,10.379905,8.03,21600273
170,50.81,10.0,Male,Yes,Sat,Dinner,3,19.681165,16.94,7822b1e1
182,45.35,3.5,Male,Yes,Sun,Dinner,3,7.717751,15.12,3d4a4a89
184,40.55,3.0,Male,Yes,Sun,Dinner,2,7.398274,20.27,2b3c65dc
197,43.11,5.0,Female,Yes,Thur,Lunch,4,11.598237,10.78,94ae3fba
212,48.33,9.0,Male,No,Sat,Dinner,4,18.621974,12.08,d4170bde


In [34]:
#MULTIPLE CONDITIONS using and condition
df[(df['total_bill']>30) & (df['sex']=='Male')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percent,price_per_person,UUID
23,39.42,7.58,Male,No,Sat,Dinner,4,19.228818,9.86,7a354150
39,31.27,5.0,Male,No,Sat,Dinner,3,15.989767,10.42,35ccfd1c
44,30.4,5.6,Male,No,Sun,Dinner,4,18.421053,7.6,4bac0e12
47,32.4,6.0,Male,No,Sun,Dinner,4,18.518519,8.1,0a51f5a1
56,38.01,3.0,Male,Yes,Sat,Dinner,4,7.89266,9.5,6befed44
59,48.27,6.73,Male,No,Sat,Dinner,4,13.942407,12.07,67254828
83,32.68,5.0,Male,Yes,Thur,Lunch,2,15.299878,16.34,d1775aba
95,40.17,4.73,Male,Yes,Fri,Dinner,4,11.774956,10.04,0c227823
112,38.07,4.0,Male,No,Sun,Dinner,3,10.506961,12.69,90979f8f
141,34.3,6.7,Male,No,Thur,Lunch,6,19.533528,5.72,50eb00c1
