In [57]:
import pandas as pd

## 1. Read a tabular data file

In [74]:
# Load the tab-separated Chipotle orders file; read_table splits on tabs by default.
orders = pd.read_table("../data-basics/chipotle.tsv")
# read_table already returns a DataFrame, so head() can be called on it directly.
orders.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [75]:
# First attempt with default settings: the file is pipe-delimited and has no header
# row, so every record lands in one column and the first record becomes the column name.
users = pd.read_table("../data-basics/movie.user")
# read_table already returns a DataFrame; no pd.DataFrame() wrapper is needed.
users.head()

Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067
2,4|24|M|technician|43537
3,5|33|F|other|15213
4,6|42|M|executive|98101


In [76]:
# The raw file needs explicit parsing options: a pipe separator and our own column names.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table(
    "../data-basics/movie.user",
    sep='|',          # fields are separated by "|"
    header=None,      # the file has no header row
    names=user_cols,  # so supply the column names ourselves
)
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


## 2. Select a Series from a DataFrame

In [77]:
# Dot notation: select the "age" column of users as a Series
users.age

0      24
1      53
2      23
3      24
4      33
       ..
938    26
939    32
940    20
941    48
942    22
Name: age, Length: 943, dtype: int64

In [78]:
# Bracket notation: selects the same "age" Series as users.age
users["age"]

0      24
1      53
2      23
3      24
4      33
       ..
938    26
939    32
940    20
941    48
942    22
Name: age, Length: 943, dtype: int64

**Bracket notation** will always work, whereas **dot notation** has limitations:

- Dot notation doesn't work if there are **spaces** in the Series name
- Dot notation doesn't work if the Series has the same name as a **DataFrame method or attribute** (like 'head' or 'shape')
- Dot notation can't be used to define the name of a **new Series** (see below)

In [79]:
# read_table splits on tabs by default, so this comma-separated file
# is loaded as a single column
ufo = pd.read_table("../data-basics/ufo.csv")
ufo.head() # sometimes the fix is not the sep argument but choosing a different read function

Unnamed: 0,"City,Colors Reported,Shape Reported,State,Time"
0,"Ithaca,,TRIANGLE,NY,6/1/1930 22:00"
1,"Willingboro,,OTHER,NJ,6/30/1930 20:00"
2,"Holyoke,,OVAL,CO,2/15/1931 14:00"
3,"Abilene,,DISK,KS,6/1/1931 13:00"
4,"New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00"


In [80]:
# read_csv splits on commas by default, so each field gets its own column
ufo = pd.read_csv("../data-basics/ufo.csv")
ufo.head()
# keep adjusting the read call until the DataFrame has the layout you want

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [81]:
# Create a new column with bracket notation by concatenating two string Series
ufo["location"] = ufo.City + ufo.State
ufo.head()
# City and State run together with no separator (e.g. "WillingboroNJ", "AbileneKS"),
# which is hard to read

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,IthacaNY
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,WillingboroNJ
2,Holyoke,,OVAL,CO,2/15/1931 14:00,HolyokeCO
3,Abilene,,DISK,KS,6/1/1931 13:00,AbileneKS
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,New York Worlds FairNY


In [82]:
# Second attempt: overwrite the column, this time with a comma between city and state
ufo["location"] = ufo["City"] + ',' + ufo["State"]
ufo.head()  # values now read like "Ithaca,NY"

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca,NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro,NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke,CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene,KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair,NY"


## 3. Commands end with parentheses


In [83]:
# Read a dataset of top-rated IMDb movies into a DataFrame and preview the first rows
movies = pd.read_csv('../data-basics/imdb_1000.csv')
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [84]:
# Example method (called with parentheses): summary statistics of the numeric columns
movies.describe()

Unnamed: 0,star_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [85]:
# Example attribute (no parentheses): (number of rows, number of columns)
movies.shape

(979, 6)

In [86]:
# Example attribute: the data type of each column
movies.dtypes # check these types; string columns are reported as "object"

star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object

In [87]:
# Use the optional `include` parameter of describe to summarize only the 'object' columns
movies.describe(include=['object'])

Unnamed: 0,title,content_rating,genre,actors_list
count,979,976,979,979
unique,975,12,16,969
top,Dracula,R,Drama,"[u'Daniel Radcliffe', u'Emma Watson', u'Rupert..."
freq,2,460,278,6


Documentation for [**`describe`**](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html)

[<a href="#Python-pandas-Q&A-video-series-by-Data-School">Back to top</a>]

## 4. Rename columns in a DataFrame

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,location
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00,"Ithaca,NY"
1,Willingboro,,OTHER,NJ,6/30/1930 20:00,"Willingboro,NJ"
2,Holyoke,,OVAL,CO,2/15/1931 14:00,"Holyoke,CO"
3,Abilene,,DISK,KS,6/1/1931 13:00,"Abilene,KS"
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00,"New York Worlds Fair,NY"
...,...,...,...,...,...,...
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00,"Grant Park,IL"
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00,"Spirit Lake,IA"
18238,Eagle River,,,WI,12/31/2000 23:45,"Eagle River,WI"
18239,Eagle River,RED,LIGHT,WI,12/31/2000 23:45,"Eagle River,WI"
