# 💾 Datasets

## Retriever

In [1]:
import pandas as pd
import retriever as rt

In [None]:
# Show the dataset names reported by retriever (displayed as the cell's output);
# presumably these are the identifiers accepted by rt.install_csv — TODO confirm.
rt.dataset_names()

In [3]:
# Install the "gdp" dataset as CSV via retriever. The returned object is kept in
# `data` so the next cell can locate the written file through `data.file.name`.
data = rt.install_csv("gdp")

Progress:  43%|██████████████████████████▉                                   | 5000/11507 [00:00<00:00, 47808.03rows/s]

Installing gdp_gdp.csv


Progress: 100%|█████████████████████████████████████████████████████████████| 11507/11507 [00:00<00:00, 49554.35rows/s]


In [4]:
# Read the CSV written by the retriever install back into pandas and preview it.
df = pd.read_csv(filepath_or_buffer=data.file.name)
df.head(n=5)

Unnamed: 0,country_name,country_code,year,value
0,Arab World,ARB,1968.0,25760680000.0
1,Arab World,ARB,1969.0,28434200000.0
2,Arab World,ARB,1970.0,31385500000.0
3,Arab World,ARB,1971.0,36426910000.0
4,Arab World,ARB,1972.0,43316060000.0


## Scikit-Learn

In [5]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [6]:
# Load the wine dataset bundled with scikit-learn.
wine = datasets.load_wine()

# Wrap the wine feature matrix in a pandas DataFrame, one column per feature name.
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


## NLTK

In [7]:
import nltk
import pandas as pd

In [8]:
# Fetch the "state_union" NLTK resource used by the cells below; quiet=True keeps
# the downloader's progress messages out of the cell output.
nltk.download(info_or_id="state_union", quiet=True)

True

In [9]:
# Short alias for the State of the Union corpus reader used by the following cells.
w = nltk.corpus.state_union

In [10]:
# Peek at the first 100 characters of the corpus' raw text.
w.raw()[:100]

"PRESIDENT HARRY S. TRUMAN'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS\n \nApril 16, 1945\n\nMr. Spe"

In [11]:
# NOTE(review): the only resource downloaded by a visible cell is "state_union";
# word_tokenize and the stopwords corpus need their own NLTK data packages, so
# confirm those are installed before running on a fresh environment.

# Split the full corpus text into word tokens.
words = nltk.word_tokenize(w.raw())

# Hold the English stop words in a set so each membership test below is a hash
# lookup instead of a linear scan over a list.
sw = set(nltk.corpus.stopwords.words("english"))

# Keep purely alphabetic tokens, lower-cased, that are not stop words. The loop
# variable is named `token` so it no longer shadows the corpus alias `w`.
cleaned = [
    token.lower()
    for token in words
    if token.isalpha() and token.lower() not in sw
]

# Count how often each remaining word occurs.
fd = nltk.FreqDist(cleaned)

In [12]:
# Turn the (word, count) pairs of the frequency distribution into a two-column
# DataFrame, then show the 20 most frequent words.
df = pd.DataFrame(data=fd.items(), columns=["word", "frequency"])
df.sort_values("frequency", ascending=False).head(n=20)

Unnamed: 0,word,frequency
89,must,1568
250,people,1313
38,world,1194
519,new,1112
6,congress,1077
99,america,1077
54,us,1049
749,year,1039
626,government,943
741,years,829


## pydbgen

In [13]:
import os
import shutil

import pandas as pd
from pydbgen import pydbgen

In [14]:
# Copy the data files found next to the installed pydbgen module into the current
# working directory; they have to be present locally for the pydbgen cells below.
# Any existing copy in the working directory is overwritten.
files = ["Domains.txt", "US_Cities.txt"]
for f in files:
    shutil.copyfile(
        # os.path.join builds the path with the platform's separator instead of a
        # hard-coded "/".
        os.path.join(os.path.dirname(pydbgen.__file__), f),
        os.path.join(os.getcwd(), f),
    )

In [15]:
# Create the pydbgen generator object used to build the synthetic DataFrame below;
# the bare name on the last line just displays the object's repr.
dbgen = pydbgen.pydb()
dbgen

<pydbgen.pydbgen.pydb at 0x28192bab6c8>

In [16]:
# Generate 25 rows of synthetic records for the requested fields, then summarize
# the resulting frame. Note that, per the output, the "phone" and "license_plate"
# fields come back as the columns "phone-number" and "license-plate".
df = dbgen.gen_dataframe(
    25, fields=["name", "city", "phone", "license_plate", "email"],
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           25 non-null     object
 1   city           25 non-null     object
 2   phone-number   25 non-null     object
 3   license-plate  25 non-null     object
 4   email          25 non-null     object
dtypes: object(5)
memory usage: 1.1+ KB


## pandas-datareader

In [17]:
import pandas_datareader.data as web
import datetime

In [18]:
# Pull the "GDP" series from FRED for 2000-01-01 through 2005-01-01 and preview it.
# Docs: https://pydata.github.io/pandas-datareader/stable/remote_data.html#remote-data-fred
gdp = web.DataReader(
    name="GDP",
    data_source="fred",
    start=datetime.datetime(2000, 1, 1),
    end=datetime.datetime(2005, 1, 1),
)
gdp.head()

Unnamed: 0_level_0,GDP
DATE,Unnamed: 1_level_1
2000-01-01,10002.857
2000-04-01,10247.679
2000-07-01,10319.825
2000-10-01,10439.025
2001-01-01,10472.879


In [19]:
# Pull the OECD "TUD" dataset from 2012-01-01 onward and preview it; the output's
# series include employees, union members and trade union density per country.
# Docs: https://pydata.github.io/pandas-datareader/stable/remote_data.html#oecd
oecd_tud = web.DataReader(
    name="TUD",
    data_source="oecd",
    start=datetime.datetime(2012, 1, 1),
)
oecd_tud.head()

Country,Hungary,Hungary,Hungary,Hungary,Hungary,Hungary,Mexico,Mexico,Mexico,Mexico,...,Portugal,Portugal,Portugal,Portugal,Germany,Germany,Germany,Germany,Germany,Germany
Source,Administrative data,Administrative data,Administrative data,Survey data,Survey data,Survey data,Administrative data,Administrative data,Administrative data,Survey data,...,Administrative data,Survey data,Survey data,Survey data,Administrative data,Administrative data,Administrative data,Survey data,Survey data,Survey data
Series,Employees,Union members,Trade union density,Employees,Union members,Trade union density,Employees,Union members,Trade union density,Employees,...,Trade union density,Employees,Union members,Trade union density,Employees,Union members,Trade union density,Employees,Union members,Trade union density
Year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2012-01-01,,,,,,,,,,,...,60.1,,,,22940.0,8099.0,35.3,,,
2013-01-01,,,,,,,,,,,...,,,,,23366.0,8154.0,34.9,,,
2014-01-01,,,,,,,,,,,...,,,,,23107.0,8092.0,35.0,,,
2015-01-01,,,,,,,,,,,...,,,,,22755.0,7964.0,35.0,,,
2016-01-01,,,,,,,,,,,...,,,,,22768.0,7893.0,34.7,,,
