In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact
import plotly.express as px

In [123]:
# Global seaborn theme: every figure defaults to a size of 8 x 4
figure_defaults = {'figure.figsize': (8, 4)}
sns.set_theme(rc=figure_defaults)

In [None]:
!unzip case_data_analyst_feb22.zip

In [None]:
# Load the pipe-delimited dataset and preview its last rows
df = pd.read_csv("dataset_ice.csv", delimiter="|")
df.tail()

In [None]:
# Inspect the rows at positions 21-23 (the end bound of an iloc slice is exclusive)
df.iloc[21:24]

In [31]:
# Applying regex to convert string to valid date format:
# a time part written as " 00.00." is rewritten to " 00:00:".
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['DATE']: the in-place call acts on a
# column selection and, under pandas Copy-on-Write, leaves df unchanged.
df['DATE'] = df['DATE'].replace(to_replace=r'( 00\.00\.)', value=' 00:00:', regex=True)

In [34]:
# Parse the DATE strings into datetime values using an explicit format
parsed_dates = pd.to_datetime(df['DATE'], format='%Y-%m-%d %H:%M:%S')
df['DATE'] = parsed_dates

In [None]:
# Checking the dataframe info (column dtypes, non-null counts, memory usage)
df.info()

In [None]:
# Column labels of df as a plain Python list
col_list = df.columns.tolist()

In [None]:
# Interactive view of the value distribution of a chosen column
@interact(Col=col_list)
def proportion_count(Col):
    """Return the relative frequency of each value in column ``Col`` of ``df``.

    Missing values are counted as their own category (``dropna=False``).
    The result is a DataFrame with the column's values and a ``Proportion``
    column holding the normalised counts.
    """
    shares = df.value_counts(subset=Col, normalize=True, dropna=False)
    return shares.reset_index(name="Proportion")

In [None]:
# 1. How did the subscription base (stock) develop through 2018?
# Count of non-null PRICEPLAN_ID values per DATE among the 'Stock' rows.
stock_rows = df.loc[df['DATA_SOURCE'] == 'Stock']
df1_abo = (
    stock_rows
    .groupby(by=['DATE', 'DATA_SOURCE'])
    .agg(COUNT=('PRICEPLAN_ID', 'count'))
    .reset_index()
)
df1_abo

In [None]:
# Q1: line chart of the per-date Stock count
ax = sns.lineplot(data=df1_abo, x='DATE', y='COUNT')
ax.set_title('Count over the year')
plt.show()

In [None]:
# Q1 again, as an interactive Plotly line chart
fig1 = px.line(
    df1_abo,
    x='DATE',
    y='COUNT',
    title='Abonnement',
    width=800,
    height=400,
)
fig1.show()

In [None]:
# 2. How did sales (activations) develop through 2018?
# Count of non-null PRICEPLAN_ID values per DATE among the 'Activation' rows.
activation_rows = df.loc[df['DATA_SOURCE'] == 'Activation']
df2_sal = (
    activation_rows
    .groupby(by=['DATE', 'DATA_SOURCE'])
    .agg(COUNT=('PRICEPLAN_ID', 'count'))
    .reset_index()
)
df2_sal

In [None]:
# Q2: line chart of the per-date Activation count
ax = sns.lineplot(data=df2_sal, x='DATE', y='COUNT')
ax.set_title('Count over the year')
plt.show()

In [None]:
# 3. How did churn develop through 2018?
# Count of non-null PRICEPLAN_ID values per DATE among the 'Cancellation' rows.
cancellation_rows = df.loc[df['DATA_SOURCE'] == 'Cancellation']
df3_chu = (
    cancellation_rows
    .groupby(by=['DATE', 'DATA_SOURCE'])
    .agg(COUNT=('PRICEPLAN_ID', 'count'))
    .reset_index()
)
df3_chu

In [None]:
# Q3: line chart of the per-date Cancellation count
ax = sns.lineplot(data=df3_chu, x='DATE', y='COUNT')
ax.set_title('Count over the year')
plt.show()

In [None]:
# 4. To which competitor does ice lose the most subscribers?
# Count of non-null PRICEPLAN_ID values per DATE and OPERATOR among the
# 'Cancellation' rows; OPERATOR is then cast to string.
cancellation_rows = df.loc[df['DATA_SOURCE'] == 'Cancellation']
df4_kon = (
    cancellation_rows
    .groupby(by=['DATE', 'DATA_SOURCE', 'OPERATOR'])
    .agg(COUNT=('PRICEPLAN_ID', 'count'))
    .reset_index()
)
df4_kon = df4_kon.assign(OPERATOR=df4_kon['OPERATOR'].astype(str))
df4_kon

In [None]:
# Q4: share of all cancellations going to each operator.
# df4_kon holds one row per (DATE, OPERATOR) with the number of cancellations
# in COUNT, so counting rows with value_counts() would only measure on how many
# dates an operator appears. Summing COUNT weights each operator by the actual
# number of cancellations before normalising.
operator_totals = df4_kon.groupby('OPERATOR')['COUNT'].sum()
operator_share = (operator_totals / operator_totals.sum()).sort_values(ascending=False)
operator_share.reset_index(name='KANSELLERINGER')

In [None]:
# Q4: per-date cancellation counts for three selected operators.
# The codes are strings such as '815.0' because OPERATOR was cast with astype(str).
selected_operators = ['815.0', '832.0', '705.0']
selected_rows = df4_kon[df4_kon['OPERATOR'].isin(selected_operators)]
ax = sns.scatterplot(data=selected_rows, x='DATE', y='COUNT', hue='OPERATOR')
ax.set_title('Count over the year')
plt.show()

In [None]:
# 5. From which competitor does ice win the most subscribers?
# Count of non-null PRICEPLAN_ID values per DATE and OPERATOR among the
# 'Activation' rows; OPERATOR is then cast to string.
activation_rows = df.loc[df['DATA_SOURCE'] == 'Activation']
df5_akt = (
    activation_rows
    .groupby(by=['DATE', 'DATA_SOURCE', 'OPERATOR'])
    .agg(COUNT=('PRICEPLAN_ID', 'count'))
    .reset_index()
)
df5_akt = df5_akt.assign(OPERATOR=df5_akt['OPERATOR'].astype(str))
df5_akt

In [None]:
# Q5: share of all activations coming from each operator.
# df5_akt holds one row per (DATE, OPERATOR) with the number of activations in
# COUNT, so counting rows with value_counts() would only measure on how many
# dates an operator appears. Summing COUNT weights each operator by the actual
# number of activations before normalising.
operator_totals = df5_akt.groupby('OPERATOR')['COUNT'].sum()
operator_share = (operator_totals / operator_totals.sum()).sort_values(ascending=False)
operator_share.reset_index(name='NYE ABONNENTER')

In [None]:
# Q5: per-date activation counts for three selected operators.
# The codes are strings such as '832.0' because OPERATOR was cast with astype(str).
selected_operators = ['832.0', '815.0', '705.0']
selected_rows = df5_akt[df5_akt['OPERATOR'].isin(selected_operators)]
ax = sns.scatterplot(data=selected_rows, x='DATE', y='COUNT', hue='OPERATOR')
ax.set_title('Count over the year')
plt.show()

In [None]:
# 6. How large was ice's market share at the end of 2018? (TODO: not answered yet; this cell contains no code.)


In [None]:
# Show every row whose PRICEPLAN_ID equals one specific plan identifier
plan_mask = df['PRICEPLAN_ID'].isin(['INB-9918193-0'])
df[plan_mask]

In [None]:
# KRONER - #7a What do you consider four important insights in the dataset,
# independent of the specific questions above?
# FEE is summed per DATE first and then rolled up per calendar month.
fee_per_date = df.groupby(by=['DATE']).agg(KRONER=('FEE', 'sum')).reset_index()
fee_per_date = fee_per_date.assign(
    MONTH=fee_per_date['DATE'].dt.month,
    # WEEKNUM is derived here but not used by the monthly roll-up below.
    WEEKNUM=fee_per_date['DATE'].dt.isocalendar().week,
)
df7_a = (
    fee_per_date
    .groupby(by=['MONTH'])
    .agg(KRONER=('KRONER', 'sum'))
    .reset_index()
)
df7_a

In [None]:
# Q7a: line chart of the monthly FEE sum
ax = sns.lineplot(data=df7_a, x='MONTH', y='KRONER')
ax.set_title('Sum over the year')
plt.show()

In [None]:
# DISCOUNT - #7b What do you consider four important insights in the dataset,
# independent of the specific questions above?
# DISCOUNT is summed per DATE, rolled up per calendar month, then sign-flipped.
discount_per_date = df.groupby(by=['DATE']).agg(DISCOUNT=('DISCOUNT', 'sum')).reset_index()
discount_per_date = discount_per_date.assign(
    MONTH=discount_per_date['DATE'].dt.month,
    # WEEKNUM is derived here but not used by the monthly roll-up below.
    WEEKNUM=discount_per_date['DATE'].dt.isocalendar().week,
)
df7_b = (
    discount_per_date
    .groupby(by=['MONTH'])
    .agg(DISCOUNT=('DISCOUNT', 'sum'))
    .reset_index()
)
# Multiply by -1 to flip the sign of the monthly totals
df7_b = df7_b.assign(DISCOUNT=df7_b['DISCOUNT'].mul(-1))
df7_b

In [None]:
# Q7b: line chart of the monthly (sign-flipped) DISCOUNT sum
ax = sns.lineplot(data=df7_b, x='MONTH', y='DISCOUNT')
ax.set_title('Sum over the year')
plt.show()

In [None]:
# LIFETIME_DAYS - #7c What do you consider four important insights in the
# dataset, independent of the specific questions above?
# Percentage histogram (11 bins) of LIFETIME_DAYS for the 'Stock' rows.
stock_rows = df[df['DATA_SOURCE'] == 'Stock']
ax = sns.histplot(data=stock_rows, x='LIFETIME_DAYS', bins=11, stat='percent')
ax.set_title('Histogram av LIFETIME_DAYS')
plt.show()

In [None]:
# MEGAS - #7d What do you consider four important insights in the dataset,
# independent of the specific questions above?
# Relative frequency of each INCLUDED_MB value (missing values included).
df7_d = df.value_counts(subset='INCLUDED_MB', normalize=True, dropna=False).reset_index(name='Proportion')
# astype() returns a new Series; assign it back so the categorical conversion
# actually takes effect (the bare call discarded its result).
df7_d['INCLUDED_MB'] = df7_d['INCLUDED_MB'].astype('category')
df7_d

In [None]:
# Q7d: bar chart of the proportion of rows per INCLUDED_MB value
ax = sns.barplot(data=df7_d, x='INCLUDED_MB', y='Proportion')
ax.set_title('Popular Subscription plans')
plt.show()