# Pandas Basics

In [None]:
import pandas as pd
# Read f500.csv into a DataFrame; the file's first column becomes the row index.
f500 = pd.read_csv(filepath_or_buffer='f500.csv', index_col=0)
# Set the index name to None.
f500.index.name = None


In [None]:
# Check the last rows of the dataframe (the counterpart of head(), which shows the first rows)
f500.tail(2)

In [None]:
# Print the data type of each column in the dataframe
print(f500.dtypes)

In [None]:
# Print a concise summary of the dataframe
f500.info()

##### Selecting a single column - returns a Series object

In [None]:
# General label-based form:  df.loc[row_label, column_label]

# Shorthand: index the dataframe with a single column label
rank_col = f500["Ceo"]
print(rank_col)   # prints the "Ceo" column; its type is pandas.core.series.Series, also called a series object

# Explicit form: ":" selects every row, "Ceo" selects the column
rank_col = f500.loc[:, "Ceo"]
print(rank_col)   # prints the same "Ceo" column, again as a series object
# NOTE(review): the variable is named rank_col but holds the "Ceo" column - consider renaming it.

#### Selecting multiple columns

In [None]:
# Shorthand: indexing with a list of column labels returns a DataFrame
# containing just those columns.
selection = f500[["Sector", "Hqcity"]]

# Explicit equivalent: ":" keeps every row, the list picks the columns.
# (Assigned to the same name on purpose - both forms select the same data.)
selection = f500.loc[:, ["Sector", "Hqcity"]]

In [None]:
# Let's finish by using a slice object with labels to select specific columns.
# With .loc the slice is label-based and includes both end labels, so this
# selects every column from "Sector" through "Hqcity".

selection = f500.loc[:, "Sector":"Hqcity"]
print(selection[0:3])   # show only the first rows of the selection

In [None]:
# Select rows by index label: a list of labels passed to .loc picks the rows labelled 4 and 8
two_rows = f500.loc[[4,8]]  
print(type(two_rows))
print(two_rows)

# Select a slice of rows with the shorthand df[start:stop] form
# NOTE(review): with integer bounds this shorthand presumably slices by position (stop excluded),
# unlike the label-based .loc slices used below - confirm against the printed output.
slice_rows = f500[1:4]
print(type(slice_rows))
print(slice_rows)


# Only the "Sector" and "Industry" cells of the rows labelled 1 and 2
middle_companies = f500.loc[[1, 2], ["Sector","Industry"]]
print(middle_companies)

# A rectangular section: rows labelled 1 through 4 and columns "Sector" through "Hqcity".
# This overwrites the previous middle_companies and is not printed here.
middle_companies = f500.loc[1:4, "Sector": "Hqcity"]

### Series methods


In [None]:
# Select one column with the shorthand syntax and print the type of the result
sectors = f500["Sector"]
print(type(sectors))

In [None]:
# Count how many times each distinct value occurs in the "Sector" column.
sectors = f500.loc[:, "Sector"]
sectors_value_counts = sectors.value_counts()
print(sectors_value_counts)

In [None]:
# Count how often each unique combination of values occurs across the two selected columns
sectors_industries = f500[["Sector","Industry"]]
print(type(sectors_industries))
si_value_counts = sectors_industries.value_counts()
print(si_value_counts)

In [None]:
# Look up how often specific values occur in a column: build the per-value
# counts once, then index the resulting Series by label.
sector = f500.loc[:, 'Sector']
sector_counts = sector.value_counts()

# A single label gives one count; a list of labels gives a Series of counts.
media = sector_counts.loc["Media"]
IndustrialsChemicalsMedia = sector_counts.loc[["Industrials", "Chemicals", "Media"]]

print("media", media)
print("IndustrialsChemicalsMedia", IndustrialsChemicalsMedia)

### Select by label: explicit syntax vs. shorthand convention


- With loc[], the ending slice **is included.**
- With iloc[], the ending slice **is not included.**

#### Because pandas is an extension of NumPy, it also supports vectorized operations.

In [None]:
import random

# Five random digits (0-9); not used again in this cell.
a = [random.randint(0, 9) for _ in range(5)]

# Two columns of five random integers each: col1 in 0-9, col2 in 10-19.
d = {
    'col1': [random.randint(0, 9) for _ in range(5)],
    'col2': [random.randint(10, 19) for _ in range(5)],
}
df = pd.DataFrame(d)

df.head()

In [None]:
# Vectorized arithmetic: adding a scalar to a Series adds it to every element.
selected = df['col1']
print('before', selected)

print('after adddition', selected + 100)

Just like with NumPy, we can use any of the standard Python numeric operators with series, including:
- series_a + series_b -      Addition
- series_a - series_b -      Subtraction
- series_a * series_b -      Multiplication (element-wise; this is unrelated to the matrix multiplication used in linear algebra).
- series_a / series_b -      Division

- Series.max()
- Series.min()
- Series.mean()
- Series.median()
- Series.mode()
- Series.sum()

In [None]:
# Count only the occurrences of one specific value in the column by chaining:
# the .value_counts() method builds the per-value counts and .loc picks a single label

media = f500["Sector"].value_counts().loc["Media"]
print(media)


Because series and dataframes are two distinct objects, they have their own unique methods. However, there are many times where both series and dataframe objects have a method of the same name that behaves in similar ways.

- Series.max()      and DataFrame.max()
- Series.min()      and DataFrame.min()
- Series.mean()     and DataFrame.mean()
- Series.median()   and DataFrame.median()
- Series.mode()     and DataFrame.mode()
- Series.sum()      and DataFrame.sum()

Unlike their series counterparts, dataframe methods take an axis parameter (defaulting to `axis=0`) that tells pandas which axis to calculate across. While you can use integers to refer to the first and second axis, pandas dataframe methods also accept the strings "index" and "columns" for the axis parameter.

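- **`axis=0`** (or **`"index"`**): calculates down the rows, producing one result per column.
- **`axis=1`** (or **`"columns"`**): calculates across the columns, producing one result per row.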

In [None]:
import random

# Build a small frame of random integers: col1 in 0-9, col2 in 10-19.
d = {
    'col1': [random.randint(0, 9) for _ in range(5)],
    'col2': [random.randint(10, 19) for _ in range(5)],
}
df = pd.DataFrame(data=d)

numeric_cols = df[["col1", "col2"]]

# Per-column medians, naming the axis with the string "index".
# (The original called .mean() here, which did not match the variable name
# or the "same as above" note on the next calculation.)
medians_by_name = numeric_cols.median(axis="index")
print(medians_by_name)

# The same calculation, naming the axis with the integer 0.
medians = numeric_cols.median(axis=0)
print(medians)

**boolean indexing** in pandas


In [None]:
import random

# Ten rows of random 1s and 2s in each column.
d = {
    'col1': [random.randint(1, 2) for _ in range(10)],
    'col2': [random.randint(1, 2) for _ in range(10)],
}
df = pd.DataFrame(data=d)

one_bool = df['col1'] == 1  # create series with boolean values

# Summing a boolean series counts its True values. Unlike
# .value_counts().loc[True], this yields 0 instead of raising KeyError when
# the random draw happens to contain no 1 at all.
true_count = int(one_bool.sum())
print('Count of True in col1: ', true_count)

# .get(label, default) keeps the value_counts() idiom but tolerates a missing label.
ones_count = int(df['col1'].value_counts().get(1, 0))
print('Count of 1 in col1: ', ones_count)

# create a new dataframe only with 1 in the col1
result = df.loc[one_bool]
print(result)

## Boolean indexing to find zero values and replace them

In [None]:
import random

# Nine rows of random 0s and 1s in each column.
d = {
    'col1': [random.randint(0, 1) for _ in range(9)],
    'col2': [random.randint(0, 1) for _ in range(9)],
}
df = pd.DataFrame(data=d)

# code in two lines replace 0 with 1000:
# build the boolean mask first, then assign through .loc[mask, column]
one_bool = df['col1'] == 0
df.loc[one_bool, "col1"] = 1000
# Count by summing a boolean comparison: gives 0 (instead of the KeyError
# raised by .value_counts().loc[1000]) when no zero was drawn.
print('Count of 1000 in col1: ', (df['col1'] == 1000).sum())

# code in one line replace 0 with 1000
df.loc[df["col2"] == 0, "col2"] = 1000
print('Count of 1000 in col2: ', (df['col2'] == 1000).sum())

print(df)

In [None]:
## Get top 2 most common numbers in the column
import random

d = {'rank': [ "rank_" + str(random.randint(0,2)) for x in range(0,9)], 'name': [ f"Name-{str(x + 1)}" for x in range(0,9)]}
df = pd.DataFrame(data=d)

top_two_nranks = df["rank"].value_counts().head(2)

print (top_two_nranks)

In [None]:
import random
import numpy as np

## replace 0 with np.nan
## get number of null values

# Nine rows: a random integer rank (0-2) and a sequential name.
d = {
    'rank': [random.randint(0, 2) for _ in range(9)],
    'name': [f"Name-{i}" for i in range(1, 10)],
}
df = pd.DataFrame(data=d)

# NaN is a float, so convert the integer column explicitly before assigning
# it; relying on pandas to upcast silently during assignment is deprecated.
df["rank"] = df["rank"].astype("float64")
df.loc[df["rank"] == 0, "rank"] = np.nan

# isnull() yields a boolean series (True where the value is missing);
# summing it counts the True values, i.e. the number of nulls.
number_of_nulls = df["rank"].isnull().sum()
print(number_of_nulls)

#### Boolean arrays with comparison operators

In [None]:
# Example only (left commented out; `dataset` is not defined in the cells shown here): checking "beds" for missing values
#     null_beds = dataset["beds"].isnull()   # boolean mask, True where "beds" is null
#     print (dataset.loc[null_beds, ["bed_type", "price", "beds"]])  # select specific columns for the rows where beds are null