Exploring Data with pandas: Intermediate

Introduction

In [1]:
import pandas as pd
import numpy as np
# Read the data set into a pandas DataFrame, using the first CSV column as the
# row index, then clear the index name that read_csv takes from the header.
f500 = pd.read_csv("f500.csv", index_col=0)
f500.index.name = None

# Replace 0 values in the "previous_rank" column with NaN.
# Series.mask returns a new column (upcast to float where needed) instead of
# writing NaN into the existing column through .loc; recent pandas versions
# deprecate that kind of silent dtype upcast on assignment.
# NOTE(review): assumes "previous_rank" is parsed as an integer column - confirm.
f500["previous_rank"] = f500["previous_rank"].mask(f500["previous_rank"] == 0)

# Keep a small three-column preview of the first five rows.
f500_selection = f500[['rank', 'revenues', 'revenue_change']].head(5)

Reading CSV files with pandas

In [2]:
# Re-read the CSV without index_col, so the frame no longer uses the first
# column as its row index.
f500 = pd.read_csv("f500.csv")

# Turn 0 values in "previous_rank" into NaN. Series.mask builds a new column
# rather than assigning NaN into the existing one through .loc, which avoids
# the silent dtype upcast that recent pandas versions deprecate.
# NOTE(review): assumes "previous_rank" is parsed as an integer column - confirm.
f500["previous_rank"] = f500["previous_rank"].mask(f500["previous_rank"] == 0)

Using iloc to select by integer position

In [3]:
# Purely positional lookups; none of these depend on the index labels.
fifth_row = f500.iloc[4, :]        # fifth row, every column
company_value = f500.iat[0, 0]     # single value at first row, first column
first_three_rows = f500.iloc[:3]   # rows at positions 0, 1 and 2
# Rows at positions 0 and 6, restricted to the first five columns.
first_seventh_row_slice = f500.iloc[[0, 6], :5]

Using pandas methods to create boolean masks

In [4]:
# Boolean mask: True for rows whose "previous_rank" is missing.
null_previous = f500["previous_rank"].isna()

# Apply the mask and pick the three columns of interest in one .loc call.
null_previous_rank = f500.loc[null_previous, ["company", "rank", "previous_rank"]]

Working with Integer Labels

In [5]:
# All rows with a missing "previous_rank" (every column kept this time).
null_previous_rank = f500.loc[f500["previous_rank"].isna()]

# The filtered frame keeps its original index labels, so take the first five
# rows by position rather than by label.
top5_null_prev_rank = null_previous_rank.head(5)

Pandas Index Alignment

In [6]:
# Only the rows that have a previous rank recorded.
previously_ranked = f500.loc[f500["previous_rank"].notna()]

# Positive values mean the company moved up the ranking.
rank_change = previously_ranked["previous_rank"].sub(previously_ranked["rank"])

# Assignment aligns on index labels; rows of f500 that are absent from
# rank_change receive NaN in the new column.
f500["rank_change"] = rank_change

Using Boolean Operators

In [7]:
# Rows with revenues above 100000 AND negative profits.
large_revenue = f500["revenues"].gt(100000)
negative_profits = f500["profits"].lt(0)
combined = large_revenue & negative_profits
big_rev_neg_profit = f500.loc[combined]

# Companies whose country is either Brazil or Venezuela.
brazil_venezuela = f500.loc[f500["country"].isin(["Brazil", "Venezuela"])]

# First five Technology-sector companies whose country is not "USA".
tech_outside_usa = f500.loc[
    (f500["sector"] == "Technology") & (f500["country"] != "USA")
].head(5)

Sorting Values

In [8]:
# Japanese companies ordered from most to fewest employees.
japan_by_employees = f500[f500["country"] == "Japan"].sort_values("employees", ascending=False)

# Read the name from the "company" column by label rather than by column
# position, so the result does not depend on the column order of the CSV.
# NOTE(review): assumes the first CSV column is "company", i.e. that this
# matches the previous iloc[0, 0] lookup - confirm.
top_japanese_employer = japan_by_employees.iloc[0]["company"]

Using Loops with Pandas

In [9]:
# Build a {country: company} lookup holding, for each country, the company
# with the most employees.
countries = f500["country"].unique()
top_employer_by_country = {}

for country in countries:
    # Rank this country's companies from most to fewest employees.
    employee_rank = f500.loc[f500["country"] == country].sort_values(
        by="employees", ascending=False
    )
    # The first row after sorting is the country's largest employer.
    best_employee = employee_rank.iloc[0]
    company = best_employee["company"]
    top_employer_by_country[country] = company

Challenge: Calculating Return on Assets by Sector

In [10]:
# Return on assets (ROA) = profits / assets, computed from the non-missing
# values of each column.
profit = f500.loc[f500["profits"].notna(), "profits"]
assets = f500.loc[f500["assets"].notna(), "assets"]
roa = profit.div(assets)

# Index alignment leaves NaN for rows where either input was missing.
f500["roa"] = roa

# Build a {sector: company} lookup of the company with the highest ROA in
# each sector.
sectors = f500["sector"].unique()
top_roa_by_sector = {}

for sector in sectors:
    # Highest ROA first; sort_values places NaN values last by default.
    roa_ranking = f500.loc[f500["sector"] == sector].sort_values(
        by="roa", ascending=False
    )
    first_roa_ranking = roa_ranking.iloc[0]
    top_roa_by_sector[sector] = first_roa_ranking["company"]