In [3]:
import pandas as pd

In [None]:
df = pd.read_csv('survey_results_public.csv')
schema_df = pd.read_csv('survey_results_schema.csv')

In [None]:
### In the cell below, the first line lets me display all columns of df, and the second all rows of schema_df.

In [None]:
pd.set_option('display.max_columns',114)
pd.set_option('display.max_rows',87)

In [None]:
# Only the last expression of a cell is rendered automatically, so the
# earlier results are shown explicitly.
display(df.head(10))  # df.tail(n) to visualize last rows.
print(df.shape)       # (number of rows, number of columns)
df.info()

In [None]:
### df = pd.DataFrame(dictionary) -> create a df from a dictionary having
### values (rows) as lists and keys corresponding to each column.

In [None]:
### A df is a container for multiple Series objects (each Series is a single column of rows).

In [None]:
df['Age'].value_counts()

In [None]:
type(df['Age'])

In [None]:
df.Age

In [None]:
# Not a series anymore -> just a filtered df

In [None]:
df[['Age','Country']]

In [None]:
df.columns # Index object holding the column labels (from the CSV header line), not a Series of row values.

In [None]:
df.iloc[[0,1]] # integer location.

In [None]:
df.iloc[[0,1],[1,2]]

In [None]:
# df.loc[] -> searching by label for rows (indexes) -> this will make sense later
# I can pass a list as a second inner [] argument

In [None]:
df.loc[[0,1],['RemoteWork','Country']]

In [None]:
df.loc[0,'Country']

In [None]:
df.loc[0:2,'Country':'LanguageAdmired'] # 2nd slice item is INCLUSIVE.

#### Index customization instead of default integer identifier for rows -> set unique labels

In [None]:
df.index

In [None]:
df.set_index('Country', inplace=True)

In [None]:
# iloc still works.

In [None]:
df.loc['Canada','EdLevel']

In [None]:
df.reset_index(inplace=True)

In [None]:
df = pd.read_csv('survey_results_public.csv', index_col='ResponseId') # Alternative to set_index()
schema_df = pd.read_csv('survey_results_schema.csv', index_col='qname') # Changing index for the schema.

In [None]:
schema_df.loc['BuildvsBuy']

#### What if I wanted to see complete text?

In [None]:
schema_df.loc['BuildvsBuy','question']

In [None]:
df.loc[39,'BuildvsBuy']

In [None]:
schema_df.sort_index(ascending=False).head() # Sort indexes in alphabetical order -> optionally: inplace=True

In [None]:
filt = (df['Country'] == 'Canada')
filt # series of True / False values.

In [None]:
df[filt]

In [None]:
# df[df['Country'] == 'Canada']

In [None]:
df.loc[filt, 'Age'] # filt (1st argument) represents rows.

In [None]:
filt2 = (df['Employment'] == 'Student, full-time') & (df['Age'] == '18-24 years old')
df.loc[filt2, ['MainBranch','YearsCode','LanguageHaveWorkedWith']]

In [None]:
# | as 'or' operator + if I want the opposite of a filter -> tilde before filt: df[~filt]
# logical values are allowed of course.

In [None]:
countries = ['India','Germany','Canada']
filt3 = df['Country'].isin(countries)
df.loc[filt3,'Country']

In [None]:
df['LanguageHaveWorkedWith']

In [None]:
filt4 = df['LanguageHaveWorkedWith'].str.contains('Python', na=False)

In [None]:
df.loc[filt4, 'LanguageHaveWorkedWith']

In [None]:
# df.columns = ['first_name'] as dimnames in R.

In [None]:
# df.rename(columns={'first_name':'first'}, inplace=True) -> nameToChange : newName

In [None]:
df.columns = [x.upper() for x in df.columns]

In [None]:
# df.columns = df.columns.str.replace(' ','_')

In [None]:
df.loc[2]

In [None]:
# df.loc[2] = [...] -> pass a list to substitute each element in the row.

In [None]:
# I can use df.at[] instead of df.loc[]

In [None]:
df.loc[2,['EMPLOYMENT','REMOTEWORK']] = ['jobs_act','rem']
df.loc[2,'CHECK'] = 'kiwis'
df.loc[2]

In [None]:
# filt = (df['email'] == 'mf@gmail.com')
# df[filt]['surname'] = 'tutuf'
# ERROR!!!!!!! (pandas warns about chained assignment) -> value set on a temporary copy that will immediately go away, so df itself is not modified.
# always use .loc[] or .at[]

In [None]:
df['CHECK'] = df['CHECK'].str.lower()

#### 4 useful methods:
#### apply() to every item in our series
#### applymap works only on df, not series objects (in pandas >= 2.1 it is deprecated in favour of DataFrame.map)
#### map works on series only
#### replace

In [None]:
# 'CHECK' was filled for a single row only, so every other entry is NaN (a float)
# and len(NaN) raises TypeError; drop the missing entries before applying len.
# Vectorised alternative that keeps the NaN rows: df['CHECK'].str.len()
df['CHECK'].dropna().apply(len)

In [None]:
def update_check(check):
    """Return ``check`` upper-cased.

    Non-string values (e.g. the NaN floats pandas uses for missing entries)
    have no ``.upper()`` method, so they are returned unchanged instead of
    raising AttributeError when the function is applied to a whole column.
    """
    if isinstance(check, str):
        return check.upper()
    return check

In [None]:
df['CHECK'] = df['CHECK'].apply(update_check)

In [None]:
# lambda functions; non-string entries (NaN) are returned untouched because they have no .lower().
df['CHECK'] = df['CHECK'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [None]:
df.apply(len) # apply() function to each series.

In [None]:
df.apply(len,axis='columns') # 'rows' is default.

##
df.applymap(str.lower) --
df.applymap(len) --
df.apply(lambda x: x.min()) --
df.apply(pd.Series.min) which should return the first string in alphabetical order.

In [None]:
df['COUNTRY'].map({'Canada':'cd'})
# The keys must match the stored values exactly (case-sensitive): the data holds 'Canada', not 'canada'.
# Values we did not substitute are converted to NaN.

In [None]:
# Unlike map(), replace() leaves unmatched values as they are; keys are case-sensitive ('Canada').
df['COUNTRY'] = df['COUNTRY'].replace({'Canada':'cd'})

In [None]:
df.rename(columns={'COMPTOTAL':'salaryUSD'}, inplace=True)

In [None]:
# Good for binary outcomes -> e.g. hobbyist: yes or no?

In [None]:
# Keys are case-sensitive and must match the stored values. 'Canada' is listed next to
# 'cd' so the mapping works whether or not the replace() cell was run first.
# Every country that is not a key becomes NaN. Alternatively, use replace().
# NOTE(review): assumes the survey spells it 'United States of America' - confirm with df['COUNTRY'].unique().
df['COUNTRY'] = df['COUNTRY'].map({'cd':True, 'Canada':True, 'United States of America':False})

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('survey_results_public.csv')
schema_df = pd.read_csv('survey_results_schema.csv')

In [None]:
pd.set_option('display.max_columns',114)
pd.set_option('display.max_rows',87)

In [None]:
df.set_index('ResponseId', inplace=True)

##### The line below is essential for using the split() method later on.

In [None]:
df['Country'] = df['Country'].str.replace(' ','_')
df['Age'] = df['Age'].str.replace(' ','_')

In [None]:
df['Country'] + ' ' + df['Age']

In [None]:
df['Combined'] = df['Country'] + ' ' + df['Age']

In [None]:
df.drop(columns=['Country','Age'], inplace=True)

In [None]:
df['Combined'].str.split(' ', expand=True)

In [None]:
df[['Country','Age']] = df['Combined'].str.split(' ', expand=True)

In [None]:
# DataFrame._append is a private helper (the public DataFrame.append was removed in pandas 2.0);
# pd.concat is the supported way to add rows. Columns missing from the new row are filled with NaN.
# The result is only displayed here, df itself is not modified.
pd.concat([df, pd.DataFrame([{'Employment':'Busy'}])], ignore_index=True)

#### Unifying two data frames

In [1]:
respondents = {
    'Employment':['Manager','Teacher'],
    'Country':['Italy','Italy']
}

In [4]:
df2 = pd.DataFrame(respondents)

In [5]:
df2.head()

Unnamed: 0,Employment,Country
0,Manager,Italy
1,Teacher,Italy


##### sort=False prevents the columns from being sorted automatically; appending has no inplace option, so the result must be assigned back to df

In [None]:
# pd.concat is the public replacement for the private DataFrame._append.
# ignore_index=True renumbers the rows 0..n-1; sort=False keeps the column order.
df = pd.concat([df, df2], ignore_index=True, sort=False)

In [None]:
df.drop(index=65437, inplace=True)

In [None]:
filt = df['Country'] == 'Italy'
df.drop(index=df[filt].index)

##### If country is equal, CompTotal is considered

In [None]:
df.sort_values(by=['Country','CompTotal'],ascending=[False,True],
               inplace=True)

In [None]:
df.sort_index()

In [None]:
df['Age'].sort_values()

In [None]:
df.sort_values(by='Country',inplace=True)

In [None]:
df.sort_values(by=['Country','CompTotal'],ascending=[True,False],
               inplace=True)

In [None]:
df[['Country','CompTotal']].head(50)

In [None]:
df['CompTotal'].nlargest(10)

In [None]:
df.nsmallest(10,'CompTotal')

## Aggregation Functions

In [None]:
df['CompTotal'].median()

##### cfr summary() in R
##### 'count' is counting the number of non-NaN responses

In [None]:
df.describe()

In [None]:
df['CompTotal'].count()

In [None]:
df['LanguageWantToWorkWith'].value_counts(normalize=True) # Output as %

#### SPLIT - APPLY FUNCTION - COMBINE RESULTS

In [None]:
df['Country'].value_counts()

#### split

In [None]:
# Group by a plain column label rather than a one-element list: with a list key
# pandas treats each group name as a 1-tuple, and get_group('India') is deprecated.
dfGroupBy_object = df.groupby('Country')

In [None]:
dfGroupBy_object.get_group('India')

In [None]:
# very similar to a filter.
filt = df['Country'] == 'India'
df.loc[filt]

#### apply & grouping

In [None]:
filt = df['Country'] == 'India'
# Select rows and column in a single .loc call instead of chaining [] after .loc.
df.loc[filt, 'LanguageWantToWorkWith'].value_counts()

In [None]:
dfGroupBy_object['LanguageWantToWorkWith'].value_counts().head(50)

In [None]:
dfGroupBy_object['LanguageWantToWorkWith'].value_counts().loc['China']

In [None]:
dfGroupBy_object['CompTotal'].median()

In [None]:
dfGroupBy_object['CompTotal'].median().loc['Italy']

In [None]:
dfGroupBy_object['CompTotal'].agg(['median','mean']).loc['Italy']

In [None]:
filt = df['Country'] == 'India'
# na=False counts respondents who left the question blank as "does not use Python",
# so the shares refer to all respondents of the country, not only those who answered.
# Rows and column are selected in one .loc call instead of chaining [] after .loc.
df.loc[filt, 'LanguageHaveWorkedWith'].str.contains('Python', na=False).value_counts(normalize=True)

In [None]:
filt = df['Country'] == 'India'
# na=False gives a pure boolean Series (no NaN), so sum() is a plain count of True values.
# Rows and column are selected in one .loc call instead of chaining [] after .loc.
df.loc[filt, 'LanguageHaveWorkedWith'].str.contains('Python', na=False).sum()

In [None]:
# dfGroupBy_object['LanguageHaveWorkedWith'].str.contains('Python').sum()
# ERROR!!!

In [None]:
dfGroupBy_object['LanguageHaveWorkedWith'].apply(lambda x: x.str.contains('Python').sum())

In [None]:
dfGroupBy_object['LanguageHaveWorkedWith'].apply(lambda x: x.str.contains('Python',na=False).value_counts(normalize=True))
# I had forgotten na=False -> if not used, NaN is not considered in the total number of responses per country (neither True nor False) 

In [None]:
# Respondents per country, named explicitly so the column label does not depend on
# the default Series name chosen by value_counts() (it differs between pandas versions).
country_respondents = df['Country'].value_counts().rename('NumRespondents')
# Python users per country: boolean mask (blank answers count as False) summed within each country.
country_pythoners = (
    df['LanguageHaveWorkedWith']
    .str.contains('Python', na=False)
    .groupby(df['Country'])
    .sum()
    .rename('NumKnowsPython')
)
# Both Series are indexed by country, so concat along the columns aligns them row by row.
python_df = pd.concat([country_respondents, country_pythoners], axis='columns', sort=False)
python_df

In [None]:
python_df['%_knowsPython'] = (python_df['NumKnowsPython']/python_df['NumRespondents'])*100

In [None]:
python_df.sort_values(by='%_knowsPython', ascending=False, inplace=True)
python_df.head(50)

In [None]:
python_df.loc['Japan']

## Casting Datatypes and Handling Missing Values

#### None, np.nan, 'NA', 'Missing' (customized)

In [None]:
df.dropna(axis='index', how='any') # default arguments
# -> dropping rows if there is at least 1 missing value.

In [None]:
df.dropna(axis='columns', how='all')

In [None]:
df.dropna(axis='index', how='all', subset=['Country'])
# with axis='columns', subset must list row labels instead, so subset=['Country'] would not work there.

In [None]:
df.dropna(axis='index', how='all', subset=['Country','Employment'])
# how='all' -> a row is dropped only when BOTH subset columns are missing; with how='any', a row is dropped if at least 1 of them is missing.

In [None]:
import numpy as np
# Turn the custom missing-value markers into real NaN, one marker at a time.
for placeholder in ('NA', 'Missing'):
    df.replace(placeholder, np.nan, inplace=True)

In [None]:
df.isna()

In [None]:
df.fillna('MISSING', inplace=False)
# I may want to turn NaN into 0 when carrying out numerical computations.

In [None]:
df.dtypes # attribute

In [None]:
# df['Employment'].mean() -> you never want to do that!!!

In [None]:
# NaN value is a float under the hood.
type(np.nan)

In [None]:
df['JobSat'] = df['JobSat'].astype(float)

In [None]:
df['CompTotal'].mean()

In [None]:
# df.astype(...) -> useless when we have mixed columns.

In [None]:
na_vals = ['NA','Missing']
df = pd.read_csv('survey_results_public.csv', index_col='ResponseId',
                 na_values=na_vals)

In [None]:
df['YearsCode'].head(10)

In [None]:
df['YearsCode'].unique() # method

In [None]:
# Assign the result back instead of calling replace(inplace=True) on df['YearsCode']:
# the in-place call acts on an intermediate object (chained assignment), so it does not
# reliably update df (pandas warns, and it is a no-op with Copy-on-Write enabled).
df['YearsCode'] = df['YearsCode'].replace({'Less than 1 year': 0, 'More than 50 years': 51})

In [None]:
df['YearsCode'] = df['YearsCode'].astype(float)

In [None]:
# Only the last expression of a cell is displayed, so compute both statistics at once.
df['YearsCode'].agg(['mean', 'median'])

## Date and Time Series Data

In [None]:
import pandas as pd
df = pd.read_csv('ETH_1h.csv')
df.head()

In [None]:
df.shape

In [None]:
df.loc[0,'Date'] # string

##### specify datetime format -> formatting string
##### https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
##### easier to just insert a kwarg when loading the file, BUT alternative:
##### df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d %I-%p')

In [None]:
df['Date']

##### x is the variable (each index)
##### d_parser = lambda x: pd.datetime.strptime(x,'%Y-%m-%d %I-%p')
##### kwarg date_parser=d_parser is deprecated -> use date_format instead 

In [None]:
df = pd.read_csv('ETH_1h.csv', parse_dates=['Date'], date_format='%Y-%m-%d %I-%p')

In [None]:
df.loc[0,'Date'].day_name()

In [None]:
df['Date'].dt.day_name() # dt class on Series object

In [None]:
df['DayOfWeek'] = df['Date'].dt.day_name()

In [None]:
df['Date'].min()

In [None]:
df['Date'].max()

In [None]:
df['Date'].max() - df['Date'].min() # time delta

In [None]:
filt = (df['Date'] >= '2019') & (df['Date'] < '2020')
df.loc[filt]

In [None]:
# Alternative
filt = (df['Date'] >= pd.to_datetime('2019-01-01')) & (df['Date'] < pd.to_datetime('2020-01-01'))

In [None]:
df.set_index('Date',inplace=True)

In [None]:
df.loc['2019']

In [None]:
# slicing - 2nd item is inclusive
df = df.sort_index()
df.loc['2020-01':'2020-02']

In [None]:
df.loc['2020-01':'2020-02']['Close'].mean()

In [None]:
 df.loc['2020-01-01']['High'].max()

### Resampling

In [None]:
highs = df['High'].resample('D').max()
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects

In [None]:
highs['2020-01-01']

In [None]:
%matplotlib inline

In [None]:
highs.plot()

In [None]:
# df.resample('W').mean()
# Same aggregation method for each column in the df -> useless if there are many data types in the df.

In [None]:
df.resample('W').agg({'Close':'mean', 'High':'max', 'Low':'min', 'Volume':'sum'})

## Reading/Writing Data to Different Sources

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('survey_results_public.csv', index_col='ResponseId')
schema_df = pd.read_csv('survey_results_schema.csv', index_col='qname')

In [None]:
# Same limits as when these files were first loaded in this notebook, so that
# every column of df and every row of schema_df stays visible.
pd.set_option('display.max_columns', 114)
pd.set_option('display.max_rows', 87)

In [None]:
df.head()

In [None]:
filt = (df['Country'] == 'India')
india_df = df.loc[filt]
india_df.head()

In [None]:
india_df.to_csv('modified.csv')

In [None]:
india_df.to_csv('modified.tsv', sep='\t')

In [None]:
india_df.to_excel('modified.xlsx')

In [None]:
test = pd.read_excel('modified.xlsx', index_col='ResponseId')

In [None]:
test

In [None]:
india_df.to_json('modified.json', orient='records', lines=True)

In [None]:
test = pd.read_json('modified.json', orient='records', lines=True)

In [None]:
test

## SQL

In [None]:
from sqlalchemy import create_engine
import psycopg2

In [None]:
import os
from urllib.parse import quote_plus

# Read the credentials from the environment rather than hard-coding them in the notebook;
# the fallbacks keep the original sample values so the cell still runs unchanged.
db_user = os.environ.get('DB_USER', 'dbuser')
db_pass = os.environ.get('DB_PASS', 'dbpass')
# quote_plus escapes characters such as '@' or ':' that would otherwise break the URL.
engine = create_engine(
    f'postgresql://{quote_plus(db_user)}:{quote_plus(db_pass)}@localhost:5432/sample_db'
)

In [None]:
india_df.to_sql('sample_table', engine, if_exists='replace')

In [None]:
sql_df = pd.read_sql('sample_table', engine, index_col='ResponseId')

In [None]:
sql_df.head()

In [None]:
sql_df = pd.read_sql_query('SELECT * FROM sample_table', engine, index_col='ResponseId')

In [None]:
sql_df.head()

##### Reading URLs

In [None]:
posts_df = pd.read_json('https://raw.githubusercontent.com/CoreyMSchafer/code_snippets/master/Python/Flask_Blog/snippets/posts.json')

In [None]:
posts_df.head()