In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Let's first load the data and see what it looks like.

In [55]:
# Read the CSV; its first column is used as the row index.
office_csv = "/kaggle/input/the-office-dataset/the_office_series.csv"
df = pd.read_csv(office_csv, index_col=0)

# Preview the first rows.
df.head()

In [56]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

The Date column is not in the right format, so let's convert it to datetime.

In [57]:
# Parse the Date column into pandas datetimes (all other columns are left untouched).
df = df.assign(Date=pd.to_datetime(df["Date"]))

In [58]:
# Re-inspect the frame after the Date conversion to confirm the column's new dtype.
df.info()

Let's use the Date column as the index of the dataset.

In [59]:
# Promote Date to the index. The guard keeps the cell re-runnable: once Date
# has become the index it is no longer a column, and calling set_index('Date')
# a second time would raise KeyError.
if 'Date' in df.columns:
    df = df.set_index('Date')

In [60]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Univariate Analysis

In [61]:
# Distribution of the Ratings column with its mean and quartiles marked.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the documented replacement for the same histogram + density-curve view.
ratings = df['Ratings']
ax = sns.histplot(ratings, kde=True, stat='density')

# Reference lines: mean plus first and third quartiles.
ax.axvline(x=np.mean(ratings), c='red', ls='--', label='mean')
ax.axvline(x=np.percentile(ratings, 25), c='green', ls='--', label='Q1')
ax.axvline(x=np.percentile(ratings, 75), c='grey', ls='--', label='Q3')

ax.set_title('Distribution of ratings')
ax.legend()
plt.show()

In [62]:
# One histogram per numeric column, using pandas' default binning.
df.hist()

In [63]:
# Column labels available for the plots that follow.
df.columns

In [64]:
# Number of rows per season. The vector is passed as `x=` explicitly:
# a bare positional argument is deprecated in seaborn 0.11 and is interpreted
# as `data` from 0.12 on, which no longer yields a per-season count.
sns.countplot(x=df['Season'])

In [65]:
# Mean of Votes per distinct index value (repeated index entries collapse
# into one point). The column is selected *before* aggregating so that the
# mean is never attempted on non-numeric columns, which raises TypeError on
# pandas >= 2.0; it also avoids computing the same groupby twice.
votes_by_date = df.groupby(df.index)['Votes'].mean()

ax1 = sns.lineplot(x=votes_by_date.index, y=votes_by_date)
plt.title("Variation of the votes over the time")
plt.show()

In [66]:
# Mean of Viewership per distinct index value (repeated index entries
# collapse into one point). The column is selected *before* aggregating so
# that the mean is never attempted on non-numeric columns, which raises
# TypeError on pandas >= 2.0; it also avoids computing the same groupby twice.
viewership_by_date = df.groupby(df.index)['Viewership'].mean()

ax2 = sns.lineplot(x=viewership_by_date.index, y=viewership_by_date)
plt.title("Variation of the Viewership over the time")

plt.show()

# Multivariate Analysis

In [67]:
# Ratings distribution within each season, drawn as one violin per Season value.
sns.violinplot(data=df, x='Season', y='Ratings')

In [68]:
# The 15 most frequent values of the Director column, with their row counts.
top_director = df['Director'].value_counts().head(15)

# Horizontal bar chart: one bar per director, bar length = number of rows.
fig, ax = plt.subplots(figsize=(10, 6))
ax.tick_params(labelsize=15)
ax.set_title('Top 15 of Directors', fontsize=25)
sns.barplot(x=top_director, y=top_director.index, orient='h', ax=ax)

In [69]:

# The 15 most frequent values of the Director and Writers columns.
top_director = df['Director'].value_counts().head(15)
top_writers = df['Writers'].value_counts().head(15)

# Apply seaborn's default theme (this also restyles every later figure).
sns.set()

# One row, two panels: directors on the left, writers on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# x and y are ready-made 15-element vectors, so no `data=` frame is passed:
# combining `data=df` with vectors of a different length is rejected by newer
# seaborn releases ("Length of ... vectors must match length of `data`").
sns.barplot(y=top_director.index, x=top_director, orient='h', ax=axes[0])
sns.barplot(y=top_writers.index, x=top_writers, orient='h', ax=axes[1])

axes[0].set_title('Top 15 directors')
axes[1].set_title('Top 15 writers')

# Keep the long y tick labels of the right panel from overlapping the left one.
fig.tight_layout()
plt.show()


In [70]:
# Mean of the Ratings column.
# NOTE(review): presumably the reference point for the 8.2 cut-off used by partition() — confirm.
df['Ratings'].mean()

In [71]:
def partition(x, threshold=8.2):
    """Turn a numeric rating into a binary label.

    Parameters
    ----------
    x : number
        The rating to classify.
    threshold : number, default 8.2
        Cut-off separating the two classes.

    Returns
    -------
    int
        0 (negative rating) when ``x < threshold``, otherwise 1 (positive
        rating). Any value for which ``x < threshold`` is false, NaN included,
        is therefore labelled 1.
    """
    if x < threshold:
        return 0  # negative rating
    return 1  # positive rating
# Replace every rating by its 0/1 label from partition().
# Note: the column is overwritten, so re-running this cell feeds the labels
# themselves back through partition().
df['Ratings'] = df['Ratings'].map(partition)

# Number of rows carrying each label.
df['Ratings'].value_counts()

In [72]:
#counting the review score with 1 and 0
notes = df.Ratings.value_counts()

#calculating the percentage of each review type
print("Total positives ratings:", notes[1], ", (", (notes[1]/(notes[1]+notes[0]))*100,"%)")
print("Total négatives ratings:", notes[0], ", (", (notes[0]/(notes[1]+notes[0]))*100,"%)")
print('\n')

#plotting bar-plot and pie chart
%matplotlib inline
sns.set_style("whitegrid")
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
plt.ylabel('Total ratings')
plt.xlabel('Label')
plt.title('Negatives ratings Vs Positives ratings',color='dimgrey')
plt.xticks([10,10.20],['0','1'])
#creating bar plots
plt.bar(10,14112, color = 'grey', width = 0.15,alpha=0.7,label='negative',edgecolor='black')
plt.bar(10.20,83143,color = '#2e4884', width = 0.15,alpha=0.9,label='positive',edgecolor='black')
plt.legend()

plt.subplot(1,2,2)
labels = ['Positive','Negative']
sizes = [83143,14112]
explode = (0, 0.1)  # only "explode" the 2nd slice (i.e. 'Hogs')
color={'#2e4884','grey'}
plt.pie(sizes,explode=explode ,colors=color,labels=labels, autopct='%1.1f%%',shadow=False, startangle=0,radius=1.5,labeldistance=1.1,textprops={'fontsize': 14},frame=True, )
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Ratings pie chart',color='dimgrey')
plt.show()