In [1]:
import pandas as pd

## 1. Read a tabular data file

In [2]:
orders = pd.read_table("../data-basics/chipotle.tsv")
pd.DataFrame(orders).head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [3]:
users= pd.read_table("../data-basics/movie.user")
pd.DataFrame(users).head()

Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067
2,4|24|M|technician|43537
3,5|33|F|other|15213
4,6|42|M|executive|98101


In [4]:
# such a raw dataset, let's handle it with parameter values
user_cols = ['user_id','age','gender','occupation','zip_code'] # make the cols names
users= pd.read_table("../data-basics/movie.user",sep='|',names=user_cols,header=None) # sep
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


## 2. Select a Series from a DataFrame

In [5]:
users.age

0      24
1      53
2      23
3      24
4      33
       ..
938    26
939    32
940    20
941    48
942    22
Name: age, Length: 943, dtype: int64

In [6]:
# Or equivalently, it is:
users["age"]

0      24
1      53
2      23
3      24
4      33
       ..
938    26
939    32
940    20
941    48
942    22
Name: age, Length: 943, dtype: int64

**Bracket notation** will always work, whereas **dot notation** has limitations:

- Dot notation doesn't work if there are **spaces** in the Series name
- Dot notation doesn't work if the Series has the same name as a **DataFrame method or attribute** (like 'head' or 'shape')
- Dot notation can't be used to define the name of a **new Series** (see below)

In [7]:
ufo = pd.read_table("../data-basics/ufo.csv")
ufo.head() # so sometimes it is not about the sep function, but the read function

Unnamed: 0,"City,Colors Reported,Shape Reported,State,Time"
0,"Ithaca,,TRIANGLE,NY,6/1/1930 22:00"
1,"Willingboro,,OTHER,NJ,6/30/1930 20:00"
2,"Holyoke,,OVAL,CO,2/15/1931 14:00"
3,"Abilene,,DISK,KS,6/1/1931 13:00"
4,"New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00"


In [8]:
# read_csv
ufo = pd.read_csv("../data-basics/ufo.csv")
ufo.head()
# lol, I like modifying the dataset until it becomes what I want

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [9]:
ufo["location"] = ufo.City + ufo.State
ufo.head()
# WillingboroNJ/AbileneKS....... sucks

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,IthacaNY
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,WillingboroNJ
2,Holyoke,,OVAL,CO,2/15/1931 14:00,HolyokeCO
3,Abilene,,DISK,KS,6/1/1931 13:00,AbileneKS
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,New York Worlds FairNY


In [10]:
# again,make it right:
ufo["location"] = ufo.City + ','+ ufo.State
ufo.head()# I love the comma

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca,NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro,NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke,CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene,KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair,NY"


## 3. Commands end with parentheses


In [11]:
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('../data-basics/imdb_1000.csv')
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [12]:
movies.describe()

Unnamed: 0,star_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [13]:
movies.shape

(979, 6)

In [14]:
# example attribute: data type of each column
movies.dtypes # take care of the types

star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object

In [15]:
#  use an optional parameter to the describe method to summarize only 'object' columns
movies.describe(include=['object'])

Unnamed: 0,title,content_rating,genre,actors_list
count,979,976,979,979
unique,975,12,16,969
top,The Girl with the Dragon Tattoo,R,Drama,"[u'Daniel Radcliffe', u'Emma Watson', u'Rupert..."
freq,2,460,278,6


Documentation for [**`describe`**](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html)

[<a href="#Python-pandas-Q&A-video-series-by-Data-School">Back to top</a>]

## 4. Rename columns in a DataFrame

In [17]:
# ufo dataset
ufo.columns

Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time',
       'location'],
      dtype='object')

In [20]:
# rename two of the columns by using the 'rename' method
ufo.rename(columns={'Colors Reported':'Colors_Reported','Shape Reported':'Shape_Reported'},inplace=True)
ufo.columns

Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time',
       'location'],
      dtype='object')

In [23]:
# Moreover,here if you want to overwrite the existing dataframe, then the inplace=true, if not  = true

In [26]:
# replace all,name the cols first:
ufo_cols = ['city','colors reported','shape reported','state','time','location']
ufo.columns =ufo_cols
ufo.head()
# Also, you can ufo = pd.read_csv('./data-basics/ufo.csv', header=0, names=ufo_cols)

Unnamed: 0,city,colors reported,shape reported,state,time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca,NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro,NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke,CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene,KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair,NY"



Documentation for [**`rename`**](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html)

In [29]:
# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = ufo.columns.str.replace(' ','_')
ufo.columns

Index(['city', 'colors_reported', 'shape_reported', 'state', 'time',
       'location'],
      dtype='object')

In [38]:
ufo.head()

Unnamed: 0,shape_reported,time,location
2,OVAL,2/15/1931 14:00,"Holyoke,CO"
3,DISK,6/1/1931 13:00,"Abilene,KS"
4,LIGHT,4/18/1933 19:00,"New York Worlds Fair,NY"
5,DISK,9/15/1934 15:30,"Valley City,ND"
6,CIRCLE,6/15/1935 0:00,"Crater Lake,CA"


Documentation for [**`str.replace`**](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.replace.html)

## 5. Remove columns from a DataFrame:drop

#### ufo go on
#### remove a single column (axis=1 refers to columns)

In [39]:
ufo.drop('colors_reported',axis=1,inplace=True)
ufo.head()

KeyError: "['colors_reported'] not found in axis"

Documentation for [**`drop`**](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html)

In [40]:
#remove multiple columns at once
ufo.drop(['city','state'],axis=1,inplace=True)
ufo.head()

KeyError: "['city' 'state'] not found in axis"

#### remove multiple rows at once (axis=0 refers to rows)

In [37]:
ufo.drop([0,1],axis=0,inplace=True)
ufo.head()

Unnamed: 0,shape_reported,time,location
2,OVAL,2/15/1931 14:00,"Holyoke,CO"
3,DISK,6/1/1931 13:00,"Abilene,KS"
4,LIGHT,4/18/1933 19:00,"New York Worlds Fair,NY"
5,DISK,9/15/1934 15:30,"Valley City,ND"
6,CIRCLE,6/15/1935 0:00,"Crater Lake,CA"


In [41]:
# new way to drop rows: specify index
ufo.drop(index=[0, 1]).head()

KeyError: '[0 1] not found in axis'

In [47]:
# # old way to drop columns: specify labels and axis
# ufo.drop(['City', 'State'], axis=1).head()
# ufo.drop(['City', 'State'], axis='columns').head()

TypeError: 'Int64Index' object is not callable

In [48]:
# Ok let's see where we are and drop sth:
ufo.head()

Unnamed: 0,shape_reported,time,location
2,OVAL,2/15/1931 14:00,"Holyoke,CO"
3,DISK,6/1/1931 13:00,"Abilene,KS"
4,LIGHT,4/18/1933 19:00,"New York Worlds Fair,NY"
5,DISK,9/15/1934 15:30,"Valley City,ND"
6,CIRCLE,6/15/1935 0:00,"Crater Lake,CA"


In [50]:
ufo.drop(columns=['time','location']).head() # have fun

Unnamed: 0,shape_reported
2,OVAL
3,DISK
4,LIGHT
5,DISK
6,CIRCLE


- [More information](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html#drop-now-also-accepts-index-columns-keywords)
- [Video: How do I remove columns from a pandas DataFrame?](https://www.youtube.com/watch?v=gnUKkS964WQ&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=6)

## 6. Read in only a subset

In [52]:
# firstly, we need the original dataset:
ufo = pd.read_csv('../data-basics/ufo.csv')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [53]:
# here we only need the columns state and the time: usecols:
ufo = pd.read_csv('../data-basics/ufo.csv',usecols=['City','State'])
ufo.head()

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY


In [55]:
# or equivalently, specify columns by position but you should know which columns they are:
ufo = pd.read_csv('../data-basics/ufo.csv',usecols=[0,3])
ufo.head()

Unnamed: 0,City,State
0,Ithaca,NY
1,Willingboro,NJ
2,Holyoke,CO
3,Abilene,KS
4,New York Worlds Fair,NY


In [56]:
# specify how many rows to read:
ufo = pd.read_csv("../data-basics/ufo.csv",nrows=3)
ufo

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00


Documentation for [**`read_csv`**](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html)

In [58]:
# let's check the column:
ufo.City

0         Ithaca
1    Willingboro
2        Holyoke
Name: City, dtype: object

In [60]:
for c in ufo.City:
    print(c)#iter through a series.
# how to iterate through a dataframe?

Ithaca
Willingboro
Holyoke


In [61]:
for index,row in ufo.iterrows():
    print(index,row.City,row.State)

0 Ithaca NY
1 Willingboro NJ
2 Holyoke CO


To preserve dtypes while iterating over the rows, it is better to use [**`itertuples()`**](http://https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.itertuples.html#pandas.DataFrame.itertuples) which returns namedtuples of the values and which is generally faster than iterrows.



## 8. Drop all non-numeric columns from a DataFrame

In [63]:
drinks = pd.read_csv('../data-basics/drinks.csv')
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [64]:
# only include numeric columns in the DataFrame
import numpy as np

drinks.select_dtypes(include=[np.number]).dtypes

beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
dtype: object

Documentation for [**`select_dtypes`**](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.select_dtypes.html)

## 9. Use string methods

In [67]:
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [70]:
# normal way to access string methods in Python
'hello'.upper

<function str.upper()>

####  string methods for pandas Series are accessed via 'str'

In [71]:
orders.item_name.str.upper().head()

0             CHIPS AND FRESH TOMATO SALSA
1                                     IZZE
2                         NANTUCKET NECTAR
3    CHIPS AND TOMATILLO-GREEN CHILI SALSA
4                             CHICKEN BOWL
Name: item_name, dtype: object

In [74]:
# string method 'contains' checks for a substring and returns a boolean Series
orders[orders.item_name.str.contains('Chicken')].head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
11,6,1,Chicken Crispy Tacos,"[Roasted Chili Corn Salsa, [Fajita Vegetables,...",$8.75
12,6,1,Chicken Soft Tacos,"[Roasted Chili Corn Salsa, [Rice, Black Beans,...",$8.75
13,7,1,Chicken Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...",$11.25


In [76]:
# string methods can be chained together (delete the "[" and "]")
orders.choice_description.str.replace('[','').head()#ok we can make it at same time:

  orders.choice_description.str.replace('[','').head()#ok we can make it at same time:


0                                                  NaN
1                                          Clementine]
2                                               Apple]
3                                                  NaN
4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object

In [79]:
orders.choice_description.str.replace('[','').str.replace(']',"").head()

  orders.choice_description.str.replace('[','').str.replace(']',"").head()


0                                                  NaN
1                                           Clementine
2                                                Apple
3                                                  NaN
4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object

In [80]:
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [81]:
# many pandas string methods support regular expressions (regex)
orders.choice_description.str.replace('[\[\]]', '').head()

  orders.choice_description.str.replace('[\[\]]', '').head()


0                                                  NaN
1                                           Clementine
2                                                Apple
3                                                  NaN
4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...
Name: choice_description, dtype: object

In [82]:
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


[String handling section](http://pandas.pydata.org/pandas-docs/stable/api.html#string-handling) of the pandas API reference

[<a href="#Python-pandas-Q&A-video-series-by-Data-School">Back to top</a>]

## 10. Change the data type of a Series

In [83]:
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('../data-basics/drinks.csv')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [84]:
# examine the data type of each Series
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [91]:
# change the data type of an existing Series
# beer_servings                     int64-----------------------beer_servings                     float
drinks['beer_servings'] = drinks.beer_servings.astype(float)
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [97]:
# alternatively, change the data type of a Series while reading in a file
drikns = pd.read_csv('../data-basics/drinks.csv', dtype={'beer_sevings':float})
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [None]:
# why can not dtype=int?