In [207]:
import pandas as pd
import numpy as np
from scipy.stats import trim_mean, kurtosis
from scipy.stats.mstats import mode, gmean, hmean

import os

from pydrive.drive import GoogleDrive
from pydrive.auth import GoogleAuth
from oauth2client.client import GoogleCredentials

In [192]:
# Link to the shared Drive folder; only the folder id (the final path segment) is used.
url = 'https://drive.google.com/drive/folders/0B98qpkK5EJemYnJ1ajA1ZVJwMzg'
shared_folder = url.rpartition('/')[2]

In [194]:
# Path to the client-secret JSON file in the user's home directory. Its config
# settings are what grant access to the Google Drive API.

creds = f"{os.path.expanduser('~')}/client_secret_574391720429-dpb8gk5gq13h87juvedat4muf2pesagj.apps.googleusercontent.com.json"

In [197]:
# Create the GoogleAuth object used to authenticate against Google Drive.
gauth = GoogleAuth()

In [198]:
# Load the client configuration from the local JSON file at `creds`.

gauth.LoadClientConfigFile(creds)

In [199]:
# Create the Drive client bound to `gauth`.
# NOTE(review): no sign-in output is recorded for this cell; in the saved outputs the
# browser prompt appears only after the first ListFile call.

drive = GoogleDrive(gauth)

In [200]:
# List every non-trashed file whose parent is the shared folder.
# Running this asks you to sign in through the browser to verify access.

folder_query = f"'{shared_folder}' in parents and trashed=false"
files = drive.ListFile({'q': folder_query}).GetList()

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=574391720429-dpb8gk5gq13h87juvedat4muf2pesagj.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [201]:
# Map each listed file's title to its Drive file id.
data_files = dict((entry['title'], entry['id']) for entry in files)

In [202]:
# Display the title -> file id mapping built from the folder listing.
data_files

{'full_train_set.csv': '1J96vAqyh92VIeh7kBFm1NBfZcvx8wp2s',
 'lc_loans.csv': '0B98qpkK5EJemRXpfa2lONlFRSms',
 'house_sales.csv': '0B98qpkK5EJemVTRRN0dLakxwTmM',
 'airline_stats.csv': '0B98qpkK5EJemMzZYZHZJaF9va0U',
 'loan_data.csv': '0B98qpkK5EJemZzdoQ2I3SWlBYzg',
 'loan3000.csv': '0B98qpkK5EJemQXYtYmJUVkdsN1U',
 'loan200.csv': '0B98qpkK5EJemd0JnQUtjb051dTA',
 'County_Zhvi_AllHomes.csv': '0B98qpkK5EJemWGRWOEhYN1RabVk',
 'LungDisease.csv': '0B98qpkK5EJemb25YYUFJZnZVSnM',
 'imanishi_data.csv': '0B98qpkK5EJemZTJnUDd5Ri1vRDA',
 'click_rates.csv': '0B98qpkK5EJemVHB0ZzdtUG9SeTg',
 'four_sessions.csv': '0B98qpkK5EJemOFdZM1JsaEF0Mnc',
 'web_page_data.csv': '0B98qpkK5EJemOC0xMHBTTEowYzg',
 'loans_income.csv': '0B98qpkK5EJemRXVld0NSbWhYNVU',
 'state.csv': '0B98qpkK5EJembFc5RmVKVVJPdGc',
 'kc_tax.csv': '0B98qpkK5EJemck5VWkszN3F3RGM',
 'dfw_airline.csv': '0B98qpkK5EJemcmZYX2VhMHBXelE',
 'sp500_data.csv': '0B98qpkK5EJemV2htZWdhVFRMNlU',
 'loanStats.csv': '0B98qpkK5EJemYlI2V2MxZGhXcGc',
 'sp500_sect

In [203]:
def download_drive_file(filename, directory):
    """
    Downloads a file from a Google Drive folder listing and returns
    a string of the filename.

    The content is written to the local path ``filename``, which for a
    bare file name means the current working directory.

    Args:

    filename:   The title of the file you want to download; it is also
                used as the local path the content is saved to.

    directory:  An iterable of Drive file objects (for example the list
                returned by ``drive.ListFile(...).GetList()``). Each item
                must support ``item['title']`` and
                ``item.GetContentFile(path)``.

    Returns:

    The ``filename`` string, so the call can be handed straight to a
    reader such as ``pd.read_csv``.

    Raises:

    FileNotFoundError:  If no item in ``directory`` has the title
                        ``filename``.
    """

    for entry in directory:
        if entry['title'] == filename:
            # Downloads the content locally.
            entry.GetContentFile(filename)
            print(f'{filename} downloaded.')
            return filename

    # Fail loudly instead of returning the message as if it were a filename:
    # a caller that feeds the result to a file reader would otherwise try to
    # open a path literally named "File <name> not found.".
    raise FileNotFoundError(f'File {filename} not found.')

In [204]:
# Download state.csv from the folder listing, then read the local copy into a DataFrame.
state_csv = download_drive_file('state.csv', files)
df = pd.read_csv(state_csv)

state.csv downloaded.


In [242]:
# Mean state population. Series.mean() gives the same value as agg(np.mean) without
# passing a NumPy callable to agg, which newer pandas versions warn about.
df['Population'].mean()

6162876.3

In [240]:
# Trimmed mean of the population with a 0.1 proportion cut.
trim_mean(df['Population'], 0.1)

4783697.125

In [243]:
# Median state population. Series.median() gives the same value as agg(np.median)
# without passing a NumPy callable to agg, which newer pandas versions warn about.
df['Population'].median()

4436369.5

In [244]:
def weighted_mean(mean, weight):
    """
    Return the weighted mean of ``mean`` using ``weight`` as the weights.

    Args:

    mean:    The values to average (a pandas Series or other 1-D
             array-like).

    weight:  The weight of each value. When both inputs are Series they
             are matched on their index.

    Returns:

    ``sum(value * weight) / sum(weight)`` over the pairs where both the
    value and the weight are present. If the remaining weights sum to
    zero the division yields ``nan`` or ``inf``.
    """
    # Drop incomplete pairs first so a missing value removes its weight from
    # the denominator as well as its product from the numerator.
    pairs = pd.DataFrame({'value': mean, 'weight': weight}).dropna()
    weighted_sum = (pairs['value'] * pairs['weight']).sum()
    total_weight = pairs['weight'].sum()
    # np.float64 keeps NumPy division semantics (nan/inf rather than
    # ZeroDivisionError) whatever scalar type the sums come back as.
    return np.float64(weighted_sum) / np.float64(total_weight)
    

In [245]:
# Murder rate averaged across states, weighting each state by its population.
weighted_mean(mean=df['Murder.Rate'], weight=df['Population'])

4.445833981123393

In [384]:
# Show the row at the median of the index values, truncated to an integer position.
# NOTE(review): this is the middle row in load order, not the median of any column.
middle_position = int(np.median(df.index.tolist()))
df.iloc[middle_position]

State           Missouri
Population       5988927
Murder.Rate          6.6
Abbreviation          MO
Name: 24, dtype: object

In [439]:
def weighted_median(median, weight):
    """
    Return the weighted median of ``median`` using ``weight`` as the weights.

    The result is the smallest observed value at which the running total
    of the weights, taken in ascending value order, reaches half of the
    total weight. No interpolation is done, so the result is always one
    of the input values.

    Args:

    median:  The values to take the median of (a pandas Series or other
             1-D array-like).

    weight:  The weight of each value; weights are assumed to be
             non-negative. When both inputs are Series they are matched
             on their index.

    Returns:

    The value from ``median`` at the weighted midpoint.

    Raises:

    ValueError:  If there is no complete value/weight pair, or if the
                 weights do not sum to a positive number.
    """
    # Fixed column names keep the two inputs apart even when the Series are
    # unnamed or share a name; pairs with a missing side are dropped.
    pairs = pd.DataFrame({'value': median, 'weight': weight}).dropna()
    if pairs.empty:
        raise ValueError('weighted_median needs at least one value/weight pair.')

    total_weight = pairs['weight'].sum()
    if not total_weight > 0:
        raise ValueError('weighted_median needs a positive total weight.')

    pairs = pairs.sort_values(by='value')
    running_weight = pairs['weight'].cumsum().values
    # argmax on a boolean array gives the first True, i.e. the first row where
    # the running weight reaches the halfway mark. The last row always
    # qualifies, so a single dominant (or single) observation is handled.
    position = int(np.argmax(running_weight >= total_weight / 2))
    return pairs['value'].iloc[position]
    


In [440]:
# Median murder rate across states, weighting each state by its population.
weighted_median(median=df['Murder.Rate'], weight=df['Population'])

4.4