<a href="https://colab.research.google.com/github/daniela500/colab_intro/blob/master/colab_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Five ways to read your data into a colab notebook

[Keyboard shortcuts for colab](https://medium.com/@tuewithmorris/google-colab-notebooks-keyboard-shortcuts-aa6a008fb91b)

In [0]:
import pandas as pd
import numpy as np

#### Method 1. Read data directly from a URL.

In [0]:
# this is the simplest way to read in data
# source: # https://opendata.dc.gov/datasets/bike-trails/data
bikes=pd.read_csv('https://opendata.arcgis.com/datasets/e8c2b7ef54fb43d9a2ed1b0b75d0a14d_4.csv') 
bikes.head()

Unnamed: 0,OBJECTID,LENGTH,NAME,STATUS,MAINTENANC,Shape_Length,MILES,ROUTEID
0,1,827.476,National Mall Trails,Open,NPS,252.214952,0.0,
1,2,19225.404,Capital Crescent Trail,Open,,5859.91494,0.0,
2,3,25404.687,Rock Creek Trail,Open,NPS,7743.36424,0.0,
3,4,6201.712,Metropolitan Branch Trail,Open,DDOT,1890.28563,0.0,
4,5,12625.091,Watts Branch Trail,Open,DDOT,3848.135455,0.0,


In [0]:
# Sometimes you have to supply the column heads.
column_headers = ['name', 'landmass', 'zone', 'area', 'population', 'language', 
                  'religion', 'bars', 'stripes', 'colours', 'red', 'green', 
                  'blue', 'gold', 'white', 'black', 'orange', 'mainhue', 
                  'circles', 'crosses', 'saltires', 'quarters', 'sunstars', 
                  'crescent', 'triangle', 'icon', 'animate', 'text', 'topleft', 
                  'botright']
flag_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data'
flags = pd.read_csv(flag_data_url, header=None, names=column_headers)
print(flags.shape)
flags.head()

In [0]:
# Let's try another example.
url='https://opendata.arcgis.com/datasets/2e65fc16edc3481989d2cc17e6f8c533_54.csv'
museums=pd.read_csv(url)
museums.sample(2)
# source: https://opendata.dc.gov/datasets/museums-in-dc/data

Unnamed: 0,X,Y,OBJECTID,NAME,ALT_NAME,LABEL,MAR_MATCHADDRESS,MAR_XCOORD,MAR_YCOORD,MAR_LONGITUDE,MAR_LATITUDE,MARID
80,-77.030866,38.908176,81,MARY BETHUNE COUNCIL HOUSE,BETHUNE MEMORIAL MUSEUM,Mary Bethune Council House,1318 VERMONT AVENUE NW,397323.1,137812.47,-77.030863,38.908168,225385
0,-77.009038,38.889802,1,EXHIBITION HALL AT THE U.S. CAPITOL VISITOR CE...,U.S. CAPITOL VISITOR CENTER,U.S. Capitol Visitor Center,,399216.07,135772.41,-77.009036,38.889794,294394


### Method 2. Read a file that is already in colab.

In [0]:
# So colab comes with some built in files.
cali = pd.read_csv('sample_data/california_housing_train.csv')
cali.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0


In [3]:
# you can also read a json file. Notice the difference in pandas method.
anscombe=pd.read_csv('sample_data/anscombe.json')
anscombe=pd.read_json('sample_data/anscombe.json')
anscombe.head()

Unnamed: 0,Series,X,Y
0,I,10,8.04
1,I,8,6.95
2,I,13,7.58
3,I,9,8.81
4,I,11,8.33


In [0]:
# Learn how to navigate your server 
# one way is to use Shell commands

In [5]:
# bellow are some shell commands that are useful 
# what is my current directory
! pwd
# list the files in the current directory
! ls

/content
sample_data


In [0]:
# what is the parent directory
 

In [9]:
# another way is to use python commands 

from pathlib import Path
# get the current working directory
Path.cwd()
home = Path.cwd()
print(home)
print(home.parent)

/content


PosixPath('/')

In [0]:
# pathlib allows us to build path and save as variable
# What's the parent dir?
home = Path.cwd()
print(home.parent)
data_dir = Path.joinpath(home, 'sample_data')
data_dir

/


PosixPath('/content/sample_data')

In [0]:
# List the contents of that folder using os
import os
os.listdir(data_dir)

['anscombe.json',
 'README.md',
 'california_housing_test.csv',
 'mnist_train_small.csv',
 'california_housing_train.csv',
 'mnist_test.csv']

In [0]:
# Use that to make a list of files.
files = list(os.listdir(data_dir))
files[2]

'california_housing_test.csv'

In [10]:
# can read the file directly if specify the path to the file or like bellow
df = pd.read_csv('sample_data/california_housing_test.csv')
df.shape

(3000, 9)

In [0]:
# Now you can read that into colab using pandas
file_path = Path.joinpath(data_dir, files[2])
print(file_path)
df=pd.read_csv(file_path)
df.head()

/content/sample_data/california_housing_test.csv


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


### Method 3. Upload a file to colab.
Reminder, uploaded files will get deleted when this runtime is recycled.

In [11]:
# You can also use the manual upload GUI over on the left of your screen.
from google.colab import files
uploaded = files.upload()

Saving winequality-red.csv to winequality-red.csv


In [0]:
df = pd.read_csv('abalone.csv')
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


###  Method 4. Read a file that's saved on github
https://github.com/austinlasseter/dash-virginia-counties

In [0]:
# useful for reading raw data from github.
url='https://raw.githubusercontent.com/austinlasseter/dash-virginia-counties/master/resources/acs2017_county_data.csv'
va=pd.read_csv(url)
print(va.shape)
va.describe()
# Source: https://github.com/austinlasseter/dash-virginia-counties/blob/master/resources/acs2017_county_data.csv

In [15]:
url2 = 'https://raw.githubusercontent.com/daniela500/colab_intro/master/data/titanic.csv'
titanic = pd.read_csv(url2)
titanic.shape
titanic.describe()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Age,Fare
count,712.0,712.0,712.0,712.0,712.0
mean,447.589888,0.404494,2.240169,29.642093,34.567251
std,258.683191,0.491139,0.836854,14.492933,52.938648
min,0.0,0.0,1.0,0.42,0.0
25%,221.75,0.0,1.0,20.0,8.05
50%,444.0,0.0,2.0,28.0,15.64585
75%,676.25,1.0,3.0,38.0,33.0
max,890.0,1.0,3.0,80.0,512.3292


In [18]:
url3 = 'https://raw.githubusercontent.com/daniela500/colab_intro/master/data/chipotle.tsv'
burritos = pd.read_csv(url3, sep='\t')
# burritos = pd.read_csv(url3, delimiter='\t') -- '\t' stands for tab
burritos.shape

(4622, 5)

### Method 5. Read a zip file.

In [21]:
# use the 'bang' for bash shell scripting.
! pwd
! ls

/content
LoanStats_2018Q4.csv.zip  sample_data  winequality-red.csv


In [20]:
# the 'wget' command reads content from the web.
!wget https://resources.lendingclub.com/LoanStats_2018Q4.csv.zip

--2019-11-12 00:39:52--  https://resources.lendingclub.com/LoanStats_2018Q4.csv.zip
Resolving resources.lendingclub.com (resources.lendingclub.com)... 64.48.1.20
Connecting to resources.lendingclub.com (resources.lendingclub.com)|64.48.1.20|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘LoanStats_2018Q4.csv.zip’

LoanStats_2018Q4.cs     [           <=>      ]  21.67M  1.86MB/s    in 12s     

2019-11-12 00:40:04 (1.85 MB/s) - ‘LoanStats_2018Q4.csv.zip’ saved [22727580]



In [22]:
# how do you unzip a .zip file in bash?
!unzip LoanStats_2018Q4.csv.zip
!ls

Archive:  LoanStats_2018Q4.csv.zip
  inflating: LoanStats_2018Q4.csv    
LoanStats_2018Q4.csv	  sample_data
LoanStats_2018Q4.csv.zip  winequality-red.csv


In [0]:
# Now you can read that csv file into pandas.
loans = pd.read_csv('LoanStats_2018Q4.csv', skiprows=1, skipfooter=2, engine='python')
loans.shape

(128412, 144)

In [25]:
## Footnote: You can also save files to csv (or other formats like pickle)
cali = pd.read_csv('sample_data/california_housing_test.csv')
cali.head()
cali.to_pickle('new_cali.pkl')
df1 = pd.read_pickle('new_cali.pkl')
df1.head()
#loans.to_pickle('loans2.pkl')
#new=pd.read_pickle('loans2.pkl')
#new.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [26]:
# You can also save to csv, but remember to skip the index.
cali.to_csv('new_cali.csv', index=False)
df2=pd.read_csv('new_cali.csv')
df2.shape

(3000, 9)