In [2]:
import pandas as pd
# Column-oriented sample data: each key becomes a column, each list its values.
people = dict(
    first=['Corey', 'Jane', 'John'],
    last=['Schafer', 'Doe', 'Doe'],
    email=['CMS@gmail.com', 'JaneDoe@gmail.com', 'JD@gmail.com'],
)

# Small practice frame that the following cells rename, edit and transform.
dftest = pd.DataFrame(data=people)

In [3]:
# Show the current column labels of the frame.
dftest.columns

Index(['first', 'last', 'email'], dtype='object')

In [8]:
# Assigning to .columns relabels every column at once, so the list has to
# supply one label per existing column, in order.
spaced_labels = ['first name', 'last name', 'email']
dftest.columns = spaced_labels
dftest

Unnamed: 0,first name,last name,email
0,Corey,Schafer,CMS@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JD@gmail.com


In [10]:
# Upper-case every column label through the vectorized .str accessor.
dftest.columns = dftest.columns.str.upper()
dftest

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,Corey,Schafer,CMS@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JD@gmail.com


In [11]:
# Swap spaces for underscores in every column label. regex=False requests a
# literal match explicitly, because the default for `regex` differs between
# pandas releases.
dftest.columns = dftest.columns.str.replace(' ', '_', regex=False)
dftest

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,Corey,Schafer,CMS@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JD@gmail.com


In [12]:
# Lower-case every column label through the vectorized .str accessor.
dftest.columns = dftest.columns.str.lower()
dftest

Unnamed: 0,first_name,last_name,email
0,Corey,Schafer,CMS@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JD@gmail.com


In [13]:
# Rename only the listed columns; labels that are not keys of the mapping
# (here 'email') are left untouched. The returned frame is rebound instead of
# using inplace=True, which keeps the step explicit and chainable.
dftest = dftest.rename(columns={'first_name': 'first', 'last_name': 'last'})
dftest

Unnamed: 0,first,last,email
0,Corey,Schafer,CMS@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JD@gmail.com


In [18]:
# Overwrite the whole row labelled 2; the values are matched to the columns
# by position (first, last, email).
replacement_row = ['John', 'Smith', 'JS@gmail.com']
dftest.loc[2] = replacement_row
dftest.loc[2]

first            John
last            Smith
email    JS@gmail.com
Name: 2, dtype: object

In [21]:
# Better: name the target columns so that only those cells of row 2 are
# overwritten and 'first' does not have to be repeated.
target_columns = ['last', 'email']
dftest.loc[2, target_columns] = ['Doe', 'JD@gmail.com']
dftest

Unnamed: 0,first,last,email
0,Corey,Schafer,CMS@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JD@gmail.com


In [22]:
# Change a single cell: row label 2, column 'last'.
dftest.loc[2, 'last'] = 'Smith'
dftest

Unnamed: 0,first,last,email
0,Corey,Schafer,CMS@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Smith,JD@gmail.com


In [24]:
# .at is the scalar accessor: it addresses exactly one cell, so both the row
# and the column must be single labels (a list such as ['last'] is not a
# valid key for it).
dftest.at[2, 'last'] = 'Doe'
dftest  # same result as the equivalent .loc assignment

Unnamed: 0,first,last,email
0,Corey,Schafer,CMS@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JD@gmail.com


In [25]:
# Normalise the stored addresses to lower case and write them back.
lowercased_emails = dftest['email'].str.lower()
dftest['email'] = lowercased_emails
dftest

Unnamed: 0,first,last,email
0,Corey,Schafer,cms@gmail.com
1,Jane,Doe,janedoe@gmail.com
2,John,Doe,jd@gmail.com


In [26]:
# apply
# Series.apply calls the function once per value.
# DataFrame.apply calls it once per column (or once per row), not once per cell.
dftest['email'].apply(len)

0    13
1    17
2    12
Name: email, dtype: int64

In [33]:
# Only the last expression of a cell is displayed, so the output below is the
# row-wise result; the column-wise result on the first line is computed and
# then discarded.
dftest.apply(len) # len of each column
dftest.apply(len, axis = 'columns') # len of each row

0    3
1    3
2    3
dtype: int64

In [34]:
# pd.Series.min receives each column as a Series, giving one minimum per column.
dftest.apply(pd.Series.min)
# Equivalent: dftest.apply(lambda x: x.min())

first            Corey
last               Doe
email    cms@gmail.com
dtype: object

In [27]:
## Example: a plain named function that can be passed to Series.apply
def update_email(email):
    """Return the given e-mail string converted to upper case."""
    uppercased = email.upper()
    return uppercased
# Returns a new upper-cased Series; the result is not assigned, so dftest
# itself is not modified here.
dftest['email'].apply(update_email)

0        CMS@GMAIL.COM
1    JANEDOE@GMAIL.COM
2         JD@GMAIL.COM
Name: email, dtype: object

In [28]:
# The stored emails are still lower case because the preceding apply result
# was never assigned back to the column.
dftest

Unnamed: 0,first,last,email
0,Corey,Schafer,cms@gmail.com
1,Jane,Doe,janedoe@gmail.com
2,John,Doe,jd@gmail.com


In [29]:
# Assign the result back so the upper-cased emails are actually stored.
uppercased_emails = dftest['email'].apply(update_email)
dftest['email'] = uppercased_emails
dftest

Unnamed: 0,first,last,email
0,Corey,Schafer,CMS@GMAIL.COM
1,Jane,Doe,JANEDOE@GMAIL.COM
2,John,Doe,JD@GMAIL.COM


In [30]:
# Same idea with an anonymous function: lower-case each address and store it.
lowercased_emails = dftest['email'].apply(lambda address: address.lower())
dftest['email'] = lowercased_emails
dftest

Unnamed: 0,first,last,email
0,Corey,Schafer,cms@gmail.com
1,Jane,Doe,janedoe@gmail.com
2,John,Doe,jd@gmail.com


In [35]:
# applymap calls the function on every individual cell of the frame.
# NOTE(review): newer pandas releases deprecate applymap in favour of
# DataFrame.map — confirm the target pandas version before switching.
dftest.applymap(len) # apply to each cell

Unnamed: 0,first,last,email
0,5,7,13
1,4,3,17
2,4,3,12


In [36]:
# Lower-case every cell; this works here because every cell holds a string.
dftest.applymap(str.lower)

Unnamed: 0,first,last,email
0,corey,schafer,cms@gmail.com
1,jane,doe,janedoe@gmail.com
2,john,doe,jd@gmail.com


In [38]:
# map: substitute values via a dict lookup.
# Values with no entry in the dict come back as NaN (see 'John' in the output).
dftest['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

In [39]:
# replace: like map, but values without an entry in the mapping are kept as-is.
first_name_swaps = {'Corey': 'Chris', 'Jane': 'Mary'}
dftest['first'] = dftest['first'].replace(first_name_swaps)
dftest

Unnamed: 0,first,last,email
0,Chris,Schafer,cms@gmail.com
1,Mary,Doe,janedoe@gmail.com
2,John,Doe,jd@gmail.com


In [52]:
# Raise the display limits so wide or long frames are not truncated when shown.
pd.set_option('display.max_rows', 85)
pd.set_option('display.max_columns', 85)

# Load the schema file and the survey answers, indexed by their 'Column' and
# 'Respondent' columns respectively.
schema_df = pd.read_csv('data/survey_results_schema.csv', index_col='Column')
df = pd.read_csv('data/survey_results_public.csv', index_col='Respondent')

In [53]:
# Rename the 'ConvertedComp' column to 'SalaryUSD'. The returned frame is
# rebound instead of using inplace=True, which keeps the step explicit and
# chainable.
df = df.rename(columns={'ConvertedComp': 'SalaryUSD'})
df['SalaryUSD']

Respondent
1            NaN
2            NaN
3         8820.0
4        61000.0
5            NaN
          ...   
88377        NaN
88601        NaN
88802        NaN
88816        NaN
88863        NaN
Name: SalaryUSD, Length: 88883, dtype: float64

In [54]:
# Count the occurrences of each distinct value in the column.
df['Hobbyist'].value_counts()

Yes    71257
No     17626
Name: Hobbyist, dtype: int64

In [55]:
# Convert the 'Yes'/'No' answers to booleans.
# Series.map turns any value missing from the dict into NaN, so re-running this
# cell on the already-converted column would wipe it out; the dtype guard makes
# the cell safe to run more than once.
if not pd.api.types.is_bool_dtype(df['Hobbyist']):
    df['Hobbyist'] = df['Hobbyist'].map({'Yes': True, 'No': False})
# Alternative that keeps unmatched values instead of producing NaN:
# df['Hobbyist'].replace({'Yes': True, 'No': False})
df['Hobbyist']

Respondent
1         True
2        False
3         True
4        False
5         True
         ...  
88377     True
88601    False
88802    False
88816    False
88863     True
Name: Hobbyist, Length: 88883, dtype: bool