In [59]:
from datetime import datetime
from datetime import timedelta

import pandas
from sklearn.linear_model import LogisticRegression

In [35]:
# Load the cleaned post export, keep only the release posts, and collapse the
# separate month-name / day / year columns into a single `date` column.
release_cols = ["post_month", "post_day", "post_year", "days_since_previous_release", "release_post"]
df = (
    pandas.read_csv("troon_instagram_clean_post_data.csv")
    .set_index("id")
    .loc[lambda posts: posts["release_post"] == True, release_cols]
    .copy()
)

# pandas.to_datetime assembles a date from columns named year / month / day.
# `post_month` holds full month names, parsed here with the '%B' directive.
date_parts = pandas.DataFrame({
    "year": df["post_year"],
    "month": df["post_month"].apply(lambda name: datetime.strptime(name, '%B').month),
    "day": df["post_day"],
})
df["date"] = pandas.to_datetime(date_parts)

# The raw date components are no longer needed once `date` exists.
df = df.drop(columns=["post_month", "post_day", "post_year"])
df

Unnamed: 0_level_0,days_since_previous_release,release_post,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
183,0.0,True,2017-02-17
197,9.0,True,2017-02-26
109,20.0,True,2017-03-18
57,6.0,True,2017-03-24
196,6.0,True,2017-03-30
...,...,...,...
271,5.0,True,2021-01-11
272,7.0,True,2021-01-18
273,7.0,True,2021-01-25
274,5.0,True,2021-01-30


In [36]:
# Sanity check: list release posts whose gap to the previous release is missing.
missing_gap = df["days_since_previous_release"].isnull()
df[(df["release_post"] == True) & missing_gap]

Unnamed: 0_level_0,days_since_previous_release,release_post,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [46]:
# Expand the release posts into one row per calendar day, from the earliest
# release date to the latest. Each row records whether a release happened that
# day and how many days have passed since the previous release.
if df.empty:
    raise ValueError("df contains no release posts; cannot build the daily calendar")

# Order the rows explicitly rather than trusting that df arrives sorted by date.
releases_by_date = df.sort_values("date", kind="stable")
last_release_date = df["date"].max()

# Set membership marks a day as a release day however many posts share that
# date, and avoids re-filtering the whole frame for every calendar day.
release_dates = set(df["date"])

filled_in = [dict(releases_by_date.iloc[0])]
current_date = filled_in[0]["date"]
days_since_release = filled_in[0]["days_since_previous_release"]  # should be 0
while current_date < last_release_date:
    days_since_release += 1
    current_date = current_date + timedelta(days=1)
    is_release = current_date in release_dates
    filled_in.append({"date": current_date, "release_post": is_release,
                      "days_since_previous_release": days_since_release})
    if is_release:
        # The release day itself keeps the elapsed count; the counter restarts
        # only for the days that follow it.
        days_since_release = 0

features_df = pandas.DataFrame(filled_in)
features_df

Unnamed: 0,days_since_previous_release,release_post,date
0,0.0,True,2017-02-17
1,1.0,False,2017-02-18
2,2.0,False,2017-02-19
3,3.0,False,2017-02-20
4,4.0,False,2017-02-21
...,...,...,...
1446,3.0,False,2021-02-02
1447,4.0,False,2021-02-03
1448,5.0,False,2021-02-04
1449,6.0,False,2021-02-05


In [48]:
# Class balance of the daily calendar: release days vs. non-release days.
release_flags = features_df["release_post"]
release_flags.value_counts()

False    1266
True      185
Name: release_post, dtype: int64

In [55]:
# Derive calendar features from each day's date with the vectorised `.dt`
# accessor instead of a Python-level lambda applied row by row.
features_df["weekday"] = features_df["date"].dt.weekday  # Monday=0 ... Sunday=6
features_df["month"] = features_df["date"].dt.month
features_df

Unnamed: 0,days_since_previous_release,release_post,date,weekday,month
0,0.0,True,2017-02-17,4,2
1,1.0,False,2017-02-18,5,2
2,2.0,False,2017-02-19,6,2
3,3.0,False,2017-02-20,0,2
4,4.0,False,2017-02-21,1,2
...,...,...,...,...,...
1446,3.0,False,2021-02-02,1,2
1447,4.0,False,2021-02-03,2,2
1448,5.0,False,2021-02-04,3,2
1449,6.0,False,2021-02-05,4,2


In [57]:
# Drop the raw date column. `errors="ignore"` keeps this cell safe to re-run:
# a second execution is a no-op instead of raising KeyError.
features_df = features_df.drop(columns=["date"], errors="ignore")

In [63]:
# Random 75/25 split with a fixed seed: sample the training rows, then treat
# every row whose index was not sampled as the test set.
train_size = int(len(features_df) * 0.75)
train_df = features_df.sample(n=train_size, random_state=1024)
held_out_mask = ~features_df.index.isin(train_df.index)
test_df = features_df[held_out_mask]
print(len(train_df), len(test_df))

1088 363


In [99]:
# Fit a class-weighted logistic regression that predicts whether a day is a
# release day from the days elapsed since the previous release and the weekday.
# NOTE(review): weekday enters as a single numeric 0-6 column - confirm that
# an ordinal encoding is intended.
lr = LogisticRegression(random_state=1024, class_weight="balanced")
train_features = train_df[["days_since_previous_release", "weekday"]]
train_target = train_df["release_post"]
lr.fit(train_features, train_target)

LogisticRegression(class_weight='balanced', random_state=1024)

In [100]:
# Predict release / non-release for the held-out days, using the same two
# feature columns the model was fitted on.
test_features = test_df[["days_since_previous_release", "weekday"]]
test_labels = lr.predict(test_features)

In [101]:
# Tally the confusion-matrix cells needed for precision and recall on the
# positive (release day) class. True negatives are not counted.
true_pos = false_pos = false_neg = 0
for actual, guess in zip(test_df["release_post"].tolist(), test_labels):
    if actual == True:
        if guess == actual:
            true_pos += 1
        elif guess == False:
            false_neg += 1
    elif actual == False and guess == True:
        false_pos += 1

In [102]:
# precision: share of predicted release days that really were release days.
# Shows NaN instead of raising ZeroDivisionError when the model predicted no
# release days at all.
predicted_pos = true_pos + false_pos
true_pos / predicted_pos if predicted_pos else float("nan")

0.208955223880597

In [103]:
# recall: share of actual release days that the model flagged.
# Shows NaN instead of raising ZeroDivisionError when the test split contains
# no release days.
actual_pos = true_pos + false_neg
true_pos / actual_pos if actual_pos else float("nan")

0.7368421052631579