In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

**A. Data Analysis with Dataframe**

In [None]:
# Path to the IMDB movie CSV
dataset_path = 'IMDB-Movie-Data.csv'
# Plain load: rows keep the default integer index
data = pd.read_csv(filepath_or_buffer=dataset_path)
# Second load of the same file, with the 'Title' column used as the row index
data_indexed = pd.read_csv(filepath_or_buffer=dataset_path, index_col='Title')

In [None]:
# Show the first five rows of the movie table
data.head(n=5)

In [None]:
# Show the last two rows of the movie table
data.tail(n=2)

In [None]:
# Print a concise summary of the frame: columns, dtypes, non-null counts, memory usage.
# info is a method; without the call parentheses the cell only shows the bound method.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# describe is a method; without the call parentheses no statistics are computed.
data.describe()

In [None]:
# Dimensions of the frame as a (number of rows, number of columns) tuple
data.shape

**Data selection** - Indexing and Slicing data

In [None]:
# Selecting a single column label yields a Series
genre = data.loc[:, 'Genre']
genre

In [None]:
# Selecting with a one-element list of labels keeps the result a DataFrame
data.loc[:, ['Genre']]

In [None]:
# Keep only the title, genre and cast columns
some_cols = data.loc[:, ['Title', 'Genre', 'Actors']]

In [None]:
# Pick the three columns first, then rows at positions 10 through 14
data[['Title', 'Genre', 'Actors']].iloc[10:15]

**Data selection** - Based on Conditional Filtering

In [None]:
# Movies released 2010-2015 (inclusive), rated below 6.0,
# whose revenue exceeds the 95th percentile of all revenues
data[
    data['Year'].between(2010, 2015)
    & (data['Rating'] < 6.0)
    & (data['Revenue (Millions)'] > data['Revenue (Millions)'].quantile(0.95))
]

**Groupby Operations**

In [None]:
# Average rating per director (first five directors in group order)
data.groupby('Director').agg({'Rating': 'mean'}).head()

**Sorting Operations**

In [None]:
# Average rating per director, highest first; show the top five
(
    data.groupby('Director')
    .agg({'Rating': 'mean'})
    .sort_values(by='Rating', ascending=False)
    .head()
)

**View missing values**

In [None]:
# Count the missing values in each column
data.isna().sum()

**Dealing with missing values - Deleting**

In [None]:
# Drop the 'Metascore' column.
# drop(..., inplace=True) returns None, so chaining .head() onto it raises
# AttributeError. Rebind the result instead and preview it as a separate expression.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
data = data.drop(columns='Metascore', errors='ignore')
data.head()

In [None]:
# Display a copy with every row containing a missing value removed;
# the result is not assigned, so `data` itself is left unchanged
data.dropna(axis=0, how='any')

**Dealing with missing values - Filling**

In [None]:
revenue_mean = data_indexed['Revenue (Millions)'].mean()
print("The mean revenue is: ", revenue_mean)
# Fill missing revenue with the mean and assign the result back to the column.
# Calling fillna(inplace=True) on a selected column acts on an intermediate object;
# under pandas Copy-on-Write that never updates data_indexed.
data_indexed['Revenue (Millions)'] = data_indexed['Revenue (Millions)'].fillna(revenue_mean)

**apply() function**

In [None]:
# Classify movies based on rating
def rating_group(rating):
  """Label a numeric rating.

  Returns 'Good' for ratings of 7.5 and above, 'Average' for ratings of
  6.0 and above (but below 7.5), and 'Bad' for everything else.
  """
  if rating >= 7.5:
    return 'Good'
  return 'Average' if rating >= 6.0 else 'Bad'
# Store the label for each movie's rating in a new 'Rating_category' column
data['Rating_category'] = data['Rating'].map(rating_group)

data.loc[:, ['Title', 'Director', 'Rating', 'Rating_category']].head()

**B. Data Analysis with Time Series data**

**Import libraries and read dataset**

In [None]:
import pandas as pd

dataset_path = './opsd_germany_daily.csv'

# Load the daily power data with default parsing
opsd_daily = pd.read_csv(filepath_or_buffer=dataset_path)

# Dimensions first, then the dtype of every column
print(opsd_daily.shape, opsd_daily.dtypes, sep='\n')
opsd_daily.head(n=3)

In [None]:
# Move the 'Date' column into the row index
opsd_daily = opsd_daily.set_index(keys='Date')
opsd_daily.head(n=3)

In [None]:
# Reload with the first column parsed as dates and used as the index
opsd_daily = pd.read_csv('./opsd_germany_daily.csv', index_col=0, parse_dates=True)

# Derive year, month and weekday-name columns from the datetime index
opsd_daily = opsd_daily.assign(**{
    'Year': opsd_daily.index.year,
    'Month': opsd_daily.index.month,
    'Weekday Name': opsd_daily.index.day_name(),
})

# Reproducible random sample of five rows
opsd_daily.sample(n=5, random_state=0)

**2. Time-based indexing**

In [None]:
# Label-based slice on the index, from 2014-01-20 through 2014-01-22
opsd_daily.loc[slice('2014-01-20', '2014-01-22')]

In [None]:
# Partial-string indexing on the datetime index: select the rows dated February 2012
opsd_daily.loc['2012-02']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn styling with a wide default figure size
sns.set(rc={'figure.figsize':(11, 4)})
# Line plot of daily consumption. The column is spelled 'Consumption';
# the previous 'Consumtion' key raised KeyError.
opsd_daily['Consumption'].plot(linewidth=0.5)

In [None]:
cols_plot = ['Consumption','Solar','Wind']
# Draw one panel per column as unconnected dots. The pandas keyword is `subplots`
# (plural); `subplot` is not a pandas option, so no array of axes came back to iterate.
axes = opsd_daily[cols_plot].plot(marker='.', alpha=0.5, linestyle='None', figsize=(11,9), subplots=True)
for ax in axes:
  ax.set_ylabel('Daily Totals (GWh)')
plt.show()

**4. Seasonality**

In [None]:
# Three stacked panels sharing the x axis: one box plot per series, grouped by month
fig, axes = plt.subplots(3,1, figsize=(11, 10), sharex=True)
# zip needs the names and the axes as two separate iterables
for name, ax in zip(['Consumption', 'Solar', 'Wind'], axes):
  # x must be the column name itself, not a one-element list
  sns.boxplot(data=opsd_daily, x='Month', y=name, ax=ax)
  ax.set_ylabel('GWh')
  ax.set_title(name)
  # Remove the automatic x-axis label from all but the bottom panel
  if ax is not axes[-1]:
    ax.set_xlabel('')

**5. Frequencies**

In [None]:
# Daily-frequency DatetimeIndex from 1998-03-10 through 1998-03-15
pd.date_range(start='1998-03-10', end='1998-03-15', freq='D')

In [None]:
# Three specific dates to pull out of the daily data
time_sample = pd.to_datetime(['2013-02-03','2013-02-06','2013-02-08'])
# Select those dates and just the consumption column. The lookup previously used
# the undefined name `times_sample`, which raised NameError.
consum_sample = opsd_daily.loc[time_sample, ['Consumption']].copy()
consum_sample

In [None]:
# Put the sample on a daily grid; dates missing from the sample become NaN rows
consum_freq = consum_sample.asfreq(freq='D')
# Add the same daily series with each gap filled from the previous observation
consum_freq['Consumption Forward Fill'] = consum_sample['Consumption'].asfreq(freq='D', method='ffill')
consum_freq

In [None]:
data_columns = ['Consumption', 'Wind', 'Solar', 'Wind+Solar']
# Downsample the selected columns to weekly bins and average within each bin
opsd_weekly_mean = opsd_daily.loc[:, data_columns].resample(rule='W').mean()
opsd_weekly_mean.head(n=3)

In [None]:
# Row counts before and after weekly resampling
print(len(opsd_daily))
print(len(opsd_weekly_mean))

In [None]:
# Start and end of the date range to extract
start, end = '2017-01', '2017-06'
# Plot the daily and the weekly-resampled solar series on one axis
fig, ax = plt.subplots()
# The keyword is `marker` (not `narker`). linestyle takes a style such as '-';
# the 0.5 was meant as the line width.
ax.plot(opsd_daily.loc[start:end, 'Solar'], marker='.', linestyle='-', linewidth=0.5, label='Daily')
ax.plot(opsd_weekly_mean.loc[start:end, 'Solar'], marker='o', markersize=8, linestyle='-', label='Weekly Mean Resample')
ax.set_ylabel('Solar Production (GWh)')
ax.legend()
plt.show()

In [None]:
# Compute the annual sums, leaving a year as NaN when it has
# fewer than 360 days of data (min_count=360).
# '&' is not a valid frequency. 'YE' is the year-end alias in pandas >= 2.2;
# on older pandas releases use 'A' instead.
opsd_annual = opsd_daily[data_columns].resample('YE').sum(min_count=360)

# The resampled index holds one year-end timestamp per year;
# replace it with just the year component
opsd_annual = opsd_annual.set_index(opsd_annual.index.year)
opsd_annual.index.name = 'Year'
# Compute the ratio of Wind+Solar production to Consumption
opsd_annual['Wind+Solar/Consumption'] = opsd_annual['Wind+Solar'] / opsd_annual['Consumption']
opsd_annual.tail(3)

In [None]:
# Bar chart of the annual Wind+Solar share of consumption, from 2012 onward.
# 'C0' (letter C, digit zero) is the first colour of the matplotlib cycle;
# 'CO' with a letter O is not a valid colour.
ax = opsd_annual.loc[2012:, 'Wind+Solar/Consumption'].plot.bar(color='C0')
ax.set_ylabel('Fraction')
ax.set_ylim(0, 0.3)
ax.set_title('Wind + Solar Share of Annual Electricity Consumption')
plt.xticks(rotation=0)

In [None]:
# Centered rolling mean over a 7-row window for the selected columns
opsd_7d = opsd_daily.loc[:, data_columns].rolling(window=7, center=True).mean()
opsd_7d.head(n=5)

**8. Trends**

In [None]:
# Centered 365-row rolling mean; a window needs at least 360 observations to give a value
opsd_365d = opsd_daily[data_columns].rolling(window=365, center=True, min_periods=360).mean()
# Plot daily, 7-day rolling mean, and 365-day rolling mean time series
fig, ax = plt.subplots()
ax.plot(opsd_daily['Consumption'], marker='.', markersize=2, color='0.6', linestyle='None', label='Daily')
ax.plot(opsd_7d['Consumption'], linewidth=2, label='7-d Rolling Mean')
ax.plot(opsd_365d['Consumption'], color='0.2', linewidth=3, label='Trend (365-d Rolling Mean)')
# Set x-ticks to yearly interval and add legend.
# YearLocator comes from matplotlib.dates, imported as mdates (`ndates` was undefined).
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.legend()
ax.set_xlabel('Year')
ax.set_ylabel('Consumption (GWh)')
ax.set_title('Trends in Electricity Consumption')
plt.show()

In [None]:
# Plot 365-day rolling mean time series of wind and solar power
fig, ax = plt.subplots()
for nn in ['Wind', 'Solar', 'Wind+Solar']:
  ax.plot(opsd_365d[nn], label=nn)
# Axis decoration is applied once, after all series are drawn, not on every iteration.
# YearLocator comes from matplotlib.dates, imported as mdates (`ndates` was undefined).
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.set_ylim(0, 400)
ax.legend()
ax.set_ylabel('Production (GWh)')
ax.set_title('Trends in Electricity Production (365-d Rolling Means)')
plt.show()