In [None]:
import boto3
import pandas as pd
import seaborn as sns
import boto3
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
import scipy.stats

In [None]:
# Load the assignment dataset and show how many rows it contains.
df = pd.read_parquet('home_assignment_data.parquet')
df.shape[0]

In [None]:
# Peek at two rows; the fixed seed keeps the displayed rows stable across re-runs.
df.sample(2, random_state=0)

In [None]:
# Sequential integer id per row, 0 .. n-1 in current row order.
df['id'] = range(df.shape[0])

In [None]:
# Column dtypes and non-null counts, after the id column has been added.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Keep only rows with a known days_since_install. The explicit .copy() makes the
# result an independent frame, so the column assignments in later cells do not
# write into a slice of the original frame (pandas' SettingWithCopyWarning, which
# the global warnings filter would otherwise hide).
df = df[df['days_since_install'].notnull()].copy()
len(df)

In [None]:
# Clicks per impression, expressed as a percentage.
df['convergance'] = df['clicks'].div(df['impressions']).mul(100)

In [None]:
# Peek at two rows including the new column; the fixed seed keeps the output reproducible.
df.sample(2, random_state=0)

In [None]:
# Distribution of clicks, unit-wide bins with edges 0..49.
sns.histplot(x="clicks", data=df, bins=range(50))

In [None]:
# Letter-value (boxen) plot of clicks.
sns.boxenplot(data=df, x="clicks")

In [None]:
# Letter-value (boxen) plot of impressions.
sns.boxenplot(data=df, x="impressions")

In [None]:
# Distribution of impressions in bins of width 25, edges from 100 up to 1975.
sns.histplot(x="impressions", data=df, bins=range(100, 2000, 25))

In [None]:
# Kendall rank correlation between impressions and clicks.
scipy.stats.kendalltau(
    df['impressions'],
    df['clicks'],
)

In [None]:
# Distribution of the click/impression percentage; bar heights are percent of rows.
sns.histplot(data=df, x="convergance", stat="percent")

In [None]:
# Kendall rank correlation between the click/impression percentage and impressions.
scipy.stats.kendalltau(
    df['convergance'],
    df['impressions'],
)


In [None]:
# Same distribution, restricted to rows with exactly 100 impressions.
sns.histplot(x="convergance", stat="percent", data=df.loc[df['impressions'] == 100])

In [None]:
# Zoom on the low end of the click/impression percentage: unit bins with edges 0..9.
sns.histplot(x="convergance", data=df, bins=range(10))

In [None]:
# Column dtypes and non-null counts of the current frame.
df.info()

In [None]:
# Summary statistics of the numeric columns of the current frame.
df.describe()

In [None]:
# Share of all rows that falls into each is_subscriber group.
df.groupby('is_subscriber').size().div(len(df))

In [None]:
# Kendall rank correlation between the click/impression percentage and is_subscriber.
scipy.stats.kendalltau(
    df['convergance'],
    df['is_subscriber'],
)

In [None]:
# Row count per is_subscriber value.
sns.countplot(data=df, x="is_subscriber")

In [None]:
# Normalise every feature name in each row's list: trim surrounding whitespace, lowercase.
df['feature_types'] = df['feature_types'].map(
    lambda raw_names: [name.strip().lower() for name in raw_names]
)

In [None]:
# Set of every distinct feature name that occurs in any row, and its size.
all_features = set().union(*df['feature_types'])
len(all_features)

In [None]:
# Rows per country as a two-column frame (country, size).
country_size = df.groupby('country').size().reset_index(name='size')
# Number of countries that occur on exactly one row.
single_template_countries = int(country_size['size'].eq(1).sum())

single_template_countries, single_template_countries / len(df['country'].unique())

In [None]:
# Row counts for the 25 most frequent countries, most frequent first.
sns.countplot(data=df, y="country", order=df['country'].value_counts().head(25).index)

In [None]:
# Row counts for the 25 most frequent raw language codes, most frequent first.
top_language_order = df['language'].value_counts().head(25).index
sns.countplot(data=df, y="language", order=top_language_order)

In [None]:
# Keep only the part of the language code before the first '-'.
# The vectorised .str accessor passes missing values through as NaN, whereas calling
# .split on each element raises as soon as one language entry is null.
df['language_clean'] = df['language'].str.split('-', n=1).str[0]
# Distinct raw codes vs. distinct cleaned codes.
len(pd.unique(df['language'])), len(pd.unique(df['language_clean']))

In [None]:
# Rows per cleaned language as a two-column frame (language_clean, size).
language_size = df.groupby('language_clean').size().reset_index(name='size')
# Number of cleaned languages that occur on exactly one row.
single_template_languages = int(language_size['size'].eq(1).sum())

single_template_languages, single_template_languages / len(df['language_clean'].unique())

In [None]:
# Row counts for the 25 most frequent cleaned language codes, most frequent first.
top_clean_language_order = df['language_clean'].value_counts().head(25).index
sns.countplot(data=df, y="language_clean", order=top_clean_language_order)

In [None]:
# Fraction of rows with days_since_install below 380.
int((df['days_since_install'] < 380).sum()) / len(df)

In [None]:
# days_since_install: a narrow first bin [0, 1), then bins of width 50 starting at 1.
sns.histplot(x="days_since_install", data=df, bins=[0, *range(1, 1000, 50)])

In [None]:
# Fraction of rows with followers_count below 5.
int((df['followers_count'] < 5).sum()) / len(df)

In [None]:

# Distribution of followers_count, unit bins with edges 0..49.
sns.histplot(x="followers_count", data=df, bins=range(50))

In [None]:
# Row count per has_facebook value.
sns.countplot(data=df, x="has_facebook")

In [None]:
# Row count per has_youtube value.
sns.countplot(data=df, x="has_youtube")

In [None]:
# Row count per has_instagram value.
sns.countplot(data=df, x="has_instagram")

In [None]:
# Row count per has_tiktok value.
sns.countplot(data=df, x="has_tiktok")

In [None]:
# Row count per has_snapchat value.
sns.countplot(data=df, x="has_snapchat")

In [None]:
# Row count per has_profile_img value.
sns.countplot(data=df, x="has_profile_img")

In [None]:
# Median duration. Series.median skips missing values, whereas np.median over the
# one-column frame returns NaN as soon as a single duration is missing.
df['duration'].median()

In [None]:
# Distribution of duration, unit bins with edges 0..149.
sns.histplot(x="duration", data=df, bins=range(150))

In [None]:
# Distribution of the height column (automatic binning).
sns.histplot(data=df, x="height")

In [None]:
# Distribution of the width column (automatic binning).
sns.histplot(data=df, x="width")

In [None]:

# Length of each row's feature list (repeated feature names are counted every time).
df['number_of_feature_types'] = df['feature_types'].map(len)
# The distinct feature names per row, and how many there are.
df['unique_feature_types'] = df['feature_types'].apply(set)
df['number_of_unique_feature_types'] = df['unique_feature_types'].map(len)


In [None]:
# Largest feature-list length found in any row.
df['number_of_feature_types'].max()

In [None]:
# Distribution of the feature-list length, unit bins with edges 0..99.
sns.histplot(x="number_of_feature_types", data=df, bins=range(100))

In [None]:
# Distribution of the number of distinct feature names per row, unit bins with edges 0..29.
sns.histplot(x="number_of_unique_feature_types", data=df, bins=range(30))

In [None]:
def count_feature(feature_types):
    """Count how many times each feature name occurs in ``feature_types``.

    Args:
        feature_types: iterable of hashable feature names.

    Returns:
        A ``defaultdict`` mapping feature name to its occurrence count as a
        float; a name that was never seen reads as ``0.0``.
    """
    tally = defaultdict(float)

    for name in feature_types:
        tally[name] = tally[name] + 1
    return tally

def get_feature_duration(feature_types, features_duration):
    """Sum the durations belonging to each feature name.

    The two inputs are paired positionally with ``zip``, so surplus entries of
    the longer input are ignored.

    Args:
        feature_types: iterable of hashable feature names.
        features_duration: iterable of numeric durations, one per feature entry.

    Returns:
        A ``defaultdict`` mapping feature name to its summed duration; a name
        that was never seen reads as ``0.0``.
    """
    totals = defaultdict(float)

    for name, span in zip(feature_types, features_duration):
        totals[name] = totals[name] + span
    return totals

def get_feature_min_start_time(feature_types, features_start_times):
    """Return the earliest start time of each feature name.

    The two inputs are paired positionally with ``zip``. For every feature name
    the smallest paired start time is kept, so the result no longer depends on
    the start times arriving in ascending order (the previous version kept
    whichever start time appeared first in the list).

    Args:
        feature_types: iterable of hashable feature names.
        features_start_times: iterable of comparable start times, one per
            feature entry.

    Returns:
        A ``defaultdict`` mapping feature name to its minimum start time; a name
        that was never seen reads as the sentinel ``-1``.
    """
    feature_to_min_start_time = defaultdict(lambda: -1)

    for (feature, start_time) in zip(feature_types, features_start_times):
        # `in` does not trigger the default factory, so unseen names are not
        # seeded with the -1 sentinel before the comparison.
        if (feature not in feature_to_min_start_time
                or start_time < feature_to_min_start_time[feature]):
            feature_to_min_start_time[feature] = start_time

    return feature_to_min_start_time


# Per-row dictionaries keyed by feature name: occurrence count, summed duration
# and start time, each built by the helper of the same purpose.
df['feature_count'] = df['feature_types'].apply(count_feature)
df['features_to_duration'] = df.apply(
    lambda row: get_feature_duration(row['feature_types'], row['feature_durations']),
    axis=1,
)
df['features_to_start_time'] = df.apply(
    lambda row: get_feature_min_start_time(row['feature_types'], row['start_times']),
    axis=1,
)


In [None]:
# One histogram per feature name: how often that feature occurs within a row.
# Sorting fixes the panel order; iterating the raw set can reorder the panels
# from one kernel session to the next.
feature_names = sorted(all_features)
# squeeze=False always returns a 2-D axes array, so indexing also works when
# there is only a single feature.
fig, axs = plt.subplots(len(feature_names), 1, figsize=(5, 120), squeeze=False)

for ax, feature in zip(axs[:, 0], feature_names):
    # .get reads the count without inserting missing keys into the stored dicts.
    counts = df['feature_count'].apply(lambda feature_count: feature_count.get(feature, 0)).tolist()

    ax.hist(counts, bins=range(0, 11, 1))
    ax.set_title(feature)

plt.tight_layout()
plt.show()

    

In [None]:
# One histogram per feature name: the summed duration of that feature within a row.
# Sorting fixes the panel order; iterating the raw set can reorder the panels
# from one kernel session to the next.
feature_names = sorted(all_features)
# squeeze=False always returns a 2-D axes array, so indexing also works when
# there is only a single feature.
fig, axs = plt.subplots(len(feature_names), 1, figsize=(5, 120), squeeze=False)

for ax, feature in zip(axs[:, 0], feature_names):
    # .get reads the duration without inserting missing keys into the stored dicts.
    durations = df['features_to_duration'].apply(
        lambda features_to_duration: features_to_duration.get(feature, 0.)
    ).tolist()

    ax.hist(durations)
    ax.set_title(feature)

plt.tight_layout()
plt.show()

    

In [None]:


# One histogram per feature name: the stored start time of that feature, taken
# only from rows that actually contain the feature.
# Sorting fixes the panel order; iterating the raw set can reorder the panels
# from one kernel session to the next.
feature_names = sorted(all_features)
# squeeze=False always returns a 2-D axes array, so indexing also works when
# there is only a single feature.
fig, axs = plt.subplots(len(feature_names), 1, figsize=(5, 120), squeeze=False)

for ax, feature in zip(axs[:, 0], feature_names):
    # .get reads the start time without inserting missing keys into the stored dicts.
    start_times = df['features_to_start_time'].apply(
        lambda features_to_start_time: features_to_start_time.get(feature, -1)
    ).tolist()

    # -1 is the sentinel for "feature not present in this row".
    start_times = [start_time for start_time in start_times if start_time != -1]

    ax.hist(start_times)
    ax.set_title(feature)

plt.tight_layout()
plt.show()

    

In [None]:
# Distribution of replaceable_count, unit bins with edges 1..99.
sns.histplot(x="replaceable_count", data=df, bins=range(1, 100))

In [None]:
# Click/impression percentage against replaceable_count.
fig, ax = plt.subplots()
sns.scatterplot(x="replaceable_count", y="convergance", data=df, ax=ax)
plt.show()

In [None]:
# Distribution of transitions_count, unit bins with edges 1..49.
sns.histplot(x="transitions_count", data=df, bins=range(1, 50))

In [None]:
# Click/impression percentage against transitions_count.
fig, ax = plt.subplots()
sns.scatterplot(x="transitions_count", y="convergance", data=df, ax=ax)
plt.show()

In [None]:
# Calendar components of the post time, extracted with the vectorised .dt accessor
# instead of a Python-level lambda call per row. pd.to_datetime leaves a column that
# is already datetime64 unchanged and converts it otherwise.
post_ts = pd.to_datetime(df['post_timestamp'])
df['post_hour'] = post_ts.dt.hour
df['post_year'] = post_ts.dt.year
df['post_month'] = post_ts.dt.month
df['post_weekday'] = post_ts.dt.weekday  # Monday=0 ... Sunday=6, same as Timestamp.weekday()

In [None]:
# Distribution of the hour of day at which rows were posted (automatic binning).
sns.histplot(data=df, x="post_hour")

In [None]:
# Months run 1-12. Edges 1..13 give every month its own bin; with edges 0..12 the
# closed last bin [11, 12] lumps months 11 and 12 together and bin [0, 1) is always empty.
sns.histplot(data=df, x="post_month", bins=range(1, 14))

In [None]:
# Distribution of the posting weekday (0..6), one unit bin per day.
sns.histplot(x="post_weekday", data=df, bins=range(8))

In [None]:
# Impressions again, zoomed in: bins of width 25 with edges from 100 up to 975.
sns.histplot(x="impressions", data=df, bins=range(100, 1000, 25))

In [None]:
# Clicks again over a wider range: unit bins with edges 0..99.
sns.histplot(x="clicks", data=df, bins=range(100))

In [None]:


# Click/impression percentage, unit bins with edges 0..20.
sns.histplot(x="convergance", data=df, bins=range(21))
         

In [None]:
# Clicks against impressions.
fig, ax = plt.subplots()
sns.scatterplot(x="impressions", y="clicks", data=df, ax=ax)
plt.show()

In [None]:
# Click/impression percentage against days_since_install.
sns.scatterplot(x="days_since_install", y="convergance", data=df)

In [None]:
# Click/impression percentage against impressions.
fig, ax = plt.subplots()
sns.scatterplot(x="impressions", y="convergance", data=df, ax=ax)
plt.show()

In [None]:
# Click/impression percentage against followers_count; the x axis is limited to 0..1000.
fig, ax = plt.subplots()
sns.scatterplot(x="followers_count", y="convergance", data=df, ax=ax)
ax.set_xlim(0, 1000)
plt.show()

In [None]:
# Click/impression percentage against duration.
fig, ax = plt.subplots()
sns.scatterplot(x="duration", y="convergance", data=df, ax=ax)
plt.show()