# Welcome to the Ultimate Pandas Introduction Tutorial

[<img src="https://www.pinclipart.com/picdir/big/367-3678882_python-logo-clipart-easy-pandas-python-logo-png.png" width="250"/>](image.png)

Learn the basics of the pandas Python library in this tutorial!


In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas_profiling import ProfileReport
import matplotlib.pylab as plt
plt.style.use('ggplot')
# Show up to 200 columns when displaying a DataFrame. The fully qualified
# option name is used because the bare pattern 'max_columns' can match more
# than one option in recent pandas versions, which raises OptionError.
pd.set_option('display.max_columns', 200)

## Series

In [None]:
# Build a Series of strings from a plain Python list.
mydata = [
    'Boat',
    'Car',
    'Bike',
    'Truck',
]
myseries1 = pd.Series(data=mydata)

In [None]:
# Print the Series of strings created in the cell above.
print(myseries1)

In [None]:
# Build a Series of integers the same way, then print it.
mydata = [
    1,
    55,
    99,
    43,
]
myseries2 = pd.Series(data=mydata)
print(myseries2)

## DataFrame

In [None]:
# Each tuple is one row of the frame: (thing, count).
mydfdata = [
    ('Boat', 1),
    ('Car', 55),
    ('Bike', 99),
    ('Truck', 43),
]
mydf = pd.DataFrame(data=mydfdata, columns=['thing', 'count'])

mydf

In [None]:
# Show the data type pandas assigned to each column of mydf.
mydf.dtypes

# Reading in Data

In [None]:
# Load the CSV file into a DataFrame. The path is relative, so the file must
# be in the current working directory.
df = pd.read_csv('MrBeast_youtube_stats.csv')

# Basic Data Inspection

In [None]:
# Preview the first rows (head() returns 5 rows by default).
df.head()

In [None]:
# Preview the last rows (tail() returns 5 rows by default).
df.tail()

In [None]:
# Show the data type of each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Print a concise overview: columns, non-null counts, dtypes, memory usage.
df.info()

In [None]:
# Count how often each distinct value occurs in the column.
# NOTE: 'column name' looks like a placeholder — replace it with a real column.
df['column name'].value_counts()

In [None]:
# List the distinct values found in the column.
# NOTE: 'column name' looks like a placeholder — replace it with a real column.
df['column name'].unique()

In [None]:
# Build a profiling report object for the whole DataFrame.
# NOTE(review): the report is only stored in df_profile here; nothing in this
# cell displays or exports it.
df_profile = ProfileReport(df)

##### Dealing with NULL

In [None]:
# Number of null values in each column.
df.isnull().sum()

In [None]:
# Drop every row that contains at least one null value, modifying df in place.
df.dropna(inplace=True)

In [None]:
# Show the rows where a specific column has null values.
# NOTE: 'column name' looks like a placeholder — replace it with a real column.

df[df['column name'].isnull()]

In [None]:
# Replace null values in a column with that column's mean.
# NOTE: 'column name' looks like a placeholder — replace it with a real
# numeric column.

count_mean = df['column name'].mean()
# Assign the filled column back to df instead of calling an in-place method on
# the selected column, which is not guaranteed to update df itself.
df['column name'] = df['column name'].fillna(count_mean)

##### Dealing with Duplicates

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# By default, duplicate rows are detected by comparing all columns.
# drop_duplicates() returns a new DataFrame; the result is not assigned here,
# so df itself is unchanged.
df.drop_duplicates()

In [None]:
# To detect duplicates on specific column(s) only, pass them via `subset`.
# NOTE: 'column name' looks like a placeholder; the result is not assigned
# here, so df itself is unchanged.
df.drop_duplicates(subset=['column name'])

# Columns and Rows

In [None]:
# Select a single column; the result is a Series.
df['viewCount']

In [None]:
# Select the column first, then the entry labelled 0 — with the default
# integer index this is the first row.
df['viewCount'][0]

In [None]:
# Select the row whose index label is 4 (.loc is label-based).
df.loc[4]

#### Set ID Column as Index

In [None]:
# Use the 'id' column as the row index. set_index() returns a new DataFrame,
# so the result is assigned back to df.
df = df.set_index('id')

#### Update Values in the Column

In [None]:
# Set the first 5 rows of the column to 10.
# NOTE: 'column name' looks like a placeholder — replace it with a real column.
# A single positional .iloc assignment is used instead of chained indexing
# (df[col][0:5] = ...), which may write to a temporary copy and leave df
# unchanged.
df.iloc[0:5, df.columns.get_loc('column name')] = 10

#### Rename and Drop Columns

In [None]:
# Rename a column by mapping old name -> new name.
# NOTE: the names look like placeholders, and the renamed frame is only
# returned, not stored — assign it back to df to keep the change.
df.rename(columns = {'Old column name' : 'New Column name'})

In [None]:
# Drop the listed columns (axis=1) from df in place.
# NOTE: the column names look like placeholders.
df.drop(['column name 1' , 'column name 2'], axis=1, inplace=True)

# Subsetting Data

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# The column labels of the DataFrame.
df.columns

In [None]:
# Restrict df to the columns of interest, in this order.
columns_to_keep = [
    'title',
    'description',
    'publishTime',
    'duration_seconds',
    'viewCount',
    'likeCount',
    'commentCount',
]
df = df[columns_to_keep]

In [None]:
# Keep only the rows with more than one million views, using a boolean mask
# with .loc.
over_a_million = df['viewCount'].gt(1_000_000)
df_subset1 = df.loc[over_a_million]

In [None]:
# Select the rows with viewCount above one million using a query expression.
df_subset2 = df.query('viewCount > 1000000')

In [None]:
# Keep only the rows where likeCount is present.
df = df[df['likeCount'].notna()]

# Casting dtypes

In [None]:
# Cast both count columns to integers.
for count_col in ('viewCount', 'likeCount'):
    df[count_col] = df[count_col].astype('int')

#### Convert to datetime datatypes

In [None]:
# Parse the publishTime column into pandas datetime values.
df['publishTime'] = pd.to_datetime(df['publishTime'])

#### Extract Month, Year, and Day into New Columns

In [None]:
# Add one column per date component, read from the datetime accessor.
publish_dt = df['publishTime'].dt
for part in ('month', 'year', 'day'):
    df[part] = getattr(publish_dt, part)

#### Format the column to get our desired output

In [None]:
# Replace the datetime values with strings formatted as MM-DD-YY.
# After this the column holds strings, so the .dt accessor no longer applies.
df['publishTime']= df['publishTime'].dt.strftime('%m-%d-%y')

#### Convert STR to Numeric datatypes

In [None]:
# Demonstrates to_numeric: the column is first cast to strings, then parsed
# back into numbers.
df['likeCount'] = pd.to_numeric(df['likeCount'].astype('str'))

# Creating new column

In [None]:
# New column: likes divided by views, computed element-wise per row.
df['like_to_view_ratio'] = df['likeCount'].div(df['viewCount'])

#### Apply-Lambda Function


In [None]:
def _season_status(season):
    """Return "Pleasant" for 1, "Sunny" for other values in (0, 2], else "Hot"."""
    if season == 1:
        return "Pleasant"
    # A chained comparison is used on purpose: `season > 0 & season <= 2`
    # would bind `0 & season` first and never check the upper bound.
    if 0 < season <= 2:
        return "Sunny"
    return "Hot"


# Label every row from its season code.
# NOTE(review): the target column name ends with a space ('season-status ');
# it is kept as-is in case other code relies on it — confirm it is intended.
# NOTE(review): a 'season' column must exist in df for this cell to run.
df['season-status '] = df['season'].apply(_season_status)

# Adding new Row

In [None]:
# Take the last row as a one-row DataFrame to use as the row to add.
df_to_append = df.tail(1)

In [None]:
# Stack the two frames row-wise. ignore_index is not passed, so the original
# index labels are kept.
df_concat = pd.concat([df, df_to_append])

In [None]:
# Join df with another DataFrame on a shared column (merge defaults to an
# inner join).
# NOTE(review): `credits` is not defined anywhere in the visible notebook and
# 'column name' looks like a placeholder — supply a real DataFrame and key.
df = df.merge(credits,on='column name')

# Plot Examples

In [None]:
# NOTE: matplotlib is already imported and the 'ggplot' style already applied
# in the first cell of the notebook; this cell repeats that setup.
import matplotlib.pylab as plt
plt.style.use('ggplot')

In [None]:
# Histogram of view counts, split into 50 bins.
view_counts = df['viewCount']
view_counts.plot.hist(
    bins=50,
    title='Distribution of View Count',
    figsize=(15, 5),
)

In [None]:
# Scatter plot of likes against views, one point per row.
df.plot.scatter(
    x='viewCount',
    y='likeCount',
    title='View vs Like Count',
)

In [None]:
# Show the rows with more than ten million likes.
df.query('likeCount > 10000000')

## Save our output

In [None]:
# Write the processed DataFrame to a CSV file in the working directory.
# index=False is not passed, so the row index is written as well.
df.to_csv('processed_data.csv')