In [16]:
# installing libraries
!pip install gspread==3.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
# Make the user's personal Google Drive available to this notebook
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Importing required libraries
import pandas as pd
import numpy as np
import gspread
from gspread_dataframe import set_with_dataframe

# Folder inside the mounted Google Drive that holds the service-account key.
# Absolute path (Drive is mounted at /content/drive) so the lookup does not
# depend on the notebook's current working directory.
drive_path = '/content/drive/MyDrive/colab_notebooks/data'

# Authenticate with the service-account credentials stored in the .json key file
gc = gspread.service_account(filename=f'{drive_path}/gcp-key-API.json')
# Open the target spreadsheet by its key
sh = gc.open_by_key('1bBElf5zCAbocg94QLsWNgr4-_sibscpcRwKZ5xnxHqg')

### Downloading data to drive

The following cells download the data used in this notebook into the `/content/` folder of the Colab runtime (Google Drive itself is mounted under `/content/drive`), using the Linux command [wget](https://shapeshed.com/unix-wget/#:~:text=What%20is%20the%20wget%20command,for%20downloads%20and%20viewing%20headers.), directly from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php).

In [None]:
!wget -c https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names

--2022-06-27 12:59:01--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5229 (5.1K) [application/x-httpd-php]
Saving to: ‘adult.names’


2022-06-27 12:59:02 (154 MB/s) - ‘adult.names’ saved [5229/5229]



In [None]:
!wget -c https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data

--2022-06-27 12:59:06--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974305 (3.8M) [application/x-httpd-php]
Saving to: ‘adult.data’


2022-06-27 12:59:07 (7.38 MB/s) - ‘adult.data’ saved [3974305/3974305]



#### Reading the metadata file

In [None]:
# Print the dataset's metadata/description file
!cat adult.names

| This data was extracted from the census bureau database found at
| http://www.census.gov/ftp/pub/DES/www/welcome.html
| Donor: Ronny Kohavi and Barry Becker,
|        Data Mining and Visualization
|        Silicon Graphics.
|        e-mail: ronnyk@sgi.com for questions.
| Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random).
| 48842 instances, mix of continuous and discrete    (train=32561, test=16281)
| 45222 if instances with unknown values are removed (train=30162, test=15060)
| Duplicate or conflicting instances : 6
| Class probabilities for adult.all file
| Probability for the label '>50K'  : 23.93% / 24.78% (without unknowns)
| Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)
|
| Extraction was done by Barry Becker from the 1994 Census database.  A set of
|   reasonably clean records was extracted using the following conditions:
|   ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
|
| Prediction task is to determine whether a person makes over

#### Reading data file

In [None]:
!head adult.data

39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K
50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K
38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K
53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners, Husband, Black, Male, 0, 0, 40, United-States, <=50K
28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K
37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K
49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K
52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K
31, 

In [None]:
# Header for the dataframe: one name per comma-separated field of adult.data,
# which ships without a header row.
cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'earning_potential',
]

#### Storing dataset in pandas dataframe

In [None]:
# adult.data has no header row, so the column names are supplied via `names`.
# Fields in the raw file are separated by ", " (comma + space); skipinitialspace=True
# drops that space so string values load as 'State-gov' rather than ' State-gov'.
df = pd.read_csv('/content/adult.data', names=cols, skipinitialspace=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,earning_potential
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# Show each column's non-null count and dtype, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                32561 non-null  int64 
 1   workclass          32561 non-null  object
 2   fnlwgt             32561 non-null  int64 
 3   education          32561 non-null  object
 4   education-num      32561 non-null  int64 
 5   marital-status     32561 non-null  object
 6   occupation         32561 non-null  object
 7   relationship       32561 non-null  object
 8   race               32561 non-null  object
 9   sex                32561 non-null  object
 10  capital-gain       32561 non-null  int64 
 11  capital-loss       32561 non-null  int64 
 12  hours-per-week     32561 non-null  int64 
 13  native-country     32561 non-null  object
 14  earning_potential  32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [None]:
# Summary statistics of the dataframe; stored in a variable so the table can be
# sent to the spreadsheet as well as displayed here.
df_describe = df.describe()
df_describe

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


#### Sending data to spreadsheet

In [18]:
# Worksheet that will receive the data; index 0 selects the first sheet in the workbook
ws = sh.get_worksheet(0)
# Row and column numbers in the spreadsheet where the table starts
row, col = 1, 1
# Write the dataframe, column header included, with set_with_dataframe
set_with_dataframe(ws, df, row=row, col=col, include_column_header=True)

In [None]:
# Send the df.describe() table to the spreadsheet; these are essentially the same
# values that Excel's statistics functions would calculate.
# Index 1 selects the second sheet in the workbook.
ws = sh.get_worksheet(1)
# Row number in the spreadsheet where the table starts
row = 4
# Column number in the spreadsheet where the table starts
col = 7
# include_index=True also writes the frame's index (count, mean, std, ...).
# Without it the statistic labels are dropped and the rows of numbers are
# unlabeled. The labels occupy column `col`; the values start one column right.
set_with_dataframe(ws, df_describe, row, col, include_index=True, include_column_header=True)