# Pandas

Pandas is a powerful library for data manipulation and analysis in Python. It is widely used in a range of fields |  including data science |  finance |  and statistics.

In [6]:
from pathlib import Path
from datetime import datetime
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

pd.set_option('display.max_rows', None)
InteractiveShell.ast_node_interactivity = "all"

## 009. Linear regression in a SQLite DB

Implementation of this GitHub ticket https://github.com/gotofritz/yarkie/issues/10

## 009.000 Assets and globals

009.sqlit.db:

| rowid | ranking | id          | view_count | comment_count | like_count | upload_date | comment_count_estimated |
|-------|---------|-------------|-----------|----------------|------------|-------------|-------------------------|
| 505   | 36      | X-uJtV8ScYk | 334895639  | 2700000       | 5985543    | 20200914    | 0                       |
...


In [7]:

DB = Path("009_sqlite.db")
TARGET_COLUMN = "comment_count_estimated"

## 009.001 Read data

1. Read columns id, view_count, like_count, upload_date, comment_count, last_updated, TARGET_COLUMN in table videos from DB into a dataframe
    1. No need for an index
    1. Print the head
1. Massage the data
    1. `upload_date` is a string but not always in constant format. Use pd builtin function to turn into a consistent datetime
    1. Create a new column `days_since_upload` which is the difference between today and `upload_date`
    1. Create three columns, `v`, `l`, `c` which are the three `xxxx_count` divided by `days_since_upload`
    1. copy `comment_count` to TARGET_COLUMN
    1. Print the head, sorting by index
1. Create `df_missing_data` and `df_non_missing_data` with rows with `comment_count` = 0  or > 0 respectively
    1. Print the head for both, sorting by index

In [35]:
import sqlite3 as sqlite

# id, view_count, like_count, upload_date, comment_count, last_updated, {TARGET_COLUMN}

# solution
1, "SQL"
con = sqlite.connect(DB)
df = pd.read_sql(f"select id, view_count, like_count, upload_date, comment_count, last_updated, {TARGET_COLUMN} from videos", con=con)
df.head()

2, "MASSAGED"
df["upload_date"] = pd.to_datetime(df['upload_date'], format='mixed')
df["days_since_upload"] = (datetime.today() - df["upload_date"]).dt.days
df["v"] = df["view_count"] / df["days_since_upload"]
df["l"] = df["like_count"] / df["days_since_upload"]
df["c"] = df["comment_count"] / df["days_since_upload"]
df[TARGET_COLUMN] = df["comment_count"]
df.sort_index().head()

3, "MISSING / NON MISSING"
df_missing_data = df[df[TARGET_COLUMN] == 0]
df_missing_data.sort_index().head()
df_non_missing_data = df[df[TARGET_COLUMN] != 0]
df_non_missing_data.sort_index().head()

(1, 'SQL')

Unnamed: 0,id,view_count,like_count,upload_date,comment_count,last_updated,comment_count_estimated
0,Tis5Tm7PAwM,7831,800,20230913,95,2023-12-06T09:08:28.887397,95
1,PGDSawOwHkw,597,22,20230131,0,2023-12-06T09:08:28.887195,88
2,4ID2bOYTj94,774,13,20230131,0,2023-12-06T09:08:28.887195,88
3,3f1lgHaldYY,681,9,20230131,0,2023-12-06T09:08:28.887195,88
4,kbax_kx380o,422,8,20230131,0,2023-12-06T09:08:28.887195,88


(2, 'MASSAGED')

Unnamed: 0,id,view_count,like_count,upload_date,comment_count,last_updated,comment_count_estimated,days_since_upload,v,l,c
0,Tis5Tm7PAwM,7831,800,2023-09-13,95,2023-12-06T09:08:28.887397,95,84,93.22619,9.52381,1.130952
1,PGDSawOwHkw,597,22,2023-01-31,0,2023-12-06T09:08:28.887195,0,309,1.932039,0.071197,0.0
2,4ID2bOYTj94,774,13,2023-01-31,0,2023-12-06T09:08:28.887195,0,309,2.504854,0.042071,0.0
3,3f1lgHaldYY,681,9,2023-01-31,0,2023-12-06T09:08:28.887195,0,309,2.203883,0.029126,0.0
4,kbax_kx380o,422,8,2023-01-31,0,2023-12-06T09:08:28.887195,0,309,1.365696,0.02589,0.0


(3, 'MISSING / NON MISSING')

Unnamed: 0,id,view_count,like_count,upload_date,comment_count,last_updated,comment_count_estimated,days_since_upload,v,l,c
1,PGDSawOwHkw,597,22,2023-01-31,0,2023-12-06T09:08:28.887195,0,309,1.932039,0.071197,0.0
2,4ID2bOYTj94,774,13,2023-01-31,0,2023-12-06T09:08:28.887195,0,309,2.504854,0.042071,0.0
3,3f1lgHaldYY,681,9,2023-01-31,0,2023-12-06T09:08:28.887195,0,309,2.203883,0.029126,0.0
4,kbax_kx380o,422,8,2023-01-31,0,2023-12-06T09:08:28.887195,0,309,1.365696,0.02589,0.0
5,QWqsz25NpsU,360,6,2023-01-31,0,2023-12-06T09:08:28.887195,0,309,1.165049,0.019417,0.0


Unnamed: 0,id,view_count,like_count,upload_date,comment_count,last_updated,comment_count_estimated,days_since_upload,v,l,c
0,Tis5Tm7PAwM,7831,800,2023-09-13,95,2023-12-06T09:08:28.887397,95,84,93.22619,9.52381,1.130952
6,YUX8fUrKRNU,739128,7515,2023-01-30,278,2023-12-06T09:08:28.887397,278,310,2384.283871,24.241935,0.896774
7,tuqs6clFBRE,23536,329,2022-12-07,16,2023-12-06T09:08:28.887397,16,364,64.659341,0.903846,0.043956
8,ukJXPUODzcM,186808,2750,2022-10-25,77,2023-12-06T09:08:28.887397,77,407,458.987715,6.756757,0.189189
9,gVPUAntzOl4,932226,13527,2022-10-03,367,2023-12-06T09:08:28.887397,367,429,2173.020979,31.531469,0.855478


## 009.003 Guestimate the data

1. Import the scikit-learn library for linear regression
1. Print the head of `df_missing_data`, sorted by index
1. Create a model
    1. Use the v, c, l columns to train it with data from non_missing_data
    1. Use the model to estimate the missing data and put it in missing_data.
        - TIP: the model will not return a df, so you can't just do `df_missing_data['c'] = ...`
1. Overwrite the TARGET_COLUMN in `df_missing_data` with the guessed values by multiplying `c` by `days_since_upload`
    - Turn them to int, but round them first
    - TIP: same, you can't set `df_missing_data[TARGET_COLUMN] = ...`
1. Print the head, sorting by index

In [9]:
# solution

1
from sklearn.linear_model import LinearRegression

2
df_missing_data.sort_index().head()

3
regression_model = LinearRegression()
regression_model.fit(
    df_non_missing_data[['v', 'l']],
    df_non_missing_data['c']
)
df_missing_data.loc[:, 'c'] = regression_model.predict(
    df_missing_data[['v', 'l']]
)

4
df_missing_data.loc[:, TARGET_COLUMN] = (
    df_missing_data["c"] * df_missing_data["days_since_upload"]
).round().astype(int)

5
df_missing_data.sort_index().head()



1

2

3

Unnamed: 0,id,view_count,like_count,upload_date,comment_count,comment_count_estimated,days_since_upload,v,l,c
1,PGDSawOwHkw,597,22,2023-01-31,0,88,308,1.938312,0.071429,0.284407
2,4ID2bOYTj94,774,13,2023-01-31,0,88,308,2.512987,0.042208,0.284659
3,3f1lgHaldYY,681,9,2023-01-31,0,88,308,2.211039,0.029221,0.284664
4,kbax_kx380o,422,8,2023-01-31,0,88,308,1.37013,0.025974,0.284519
5,QWqsz25NpsU,360,6,2023-01-31,0,88,308,1.168831,0.019481,0.284512


## 009.004 Update the DB

1. Create a list of records (a new var) in the format `{'id': 'PGDSawOwHkw', '{TARGET_COLUMN}': 438}, ...` 
    1. Start from `df_missing_data`
    1. Print last 5 records
    1. Add `df_non_missing_data`
    1. Print last 5 records
1. Write the data into the db
    1. Initialise the DB in sqlite_utils
    1. Print the first 5 records by using the the `videos` table 'rows' generator 
    1. assert that the number of records with are going to update is the same is the count of all records in that table
    1. Use `upsert_all` to update records (there is no `update_add`). Don't forget the pk
    1. Print the first 5 records by using the the `videos` table 'rows' generator 

In [27]:
from sqlite_utils import Database

# solution

1
df_missing_data.loc[:, "last_updated"] = pd.Timestamp.now().isoformat()
df_non_missing_data.loc[:, "last_updated"] = pd.Timestamp.now().isoformat()
records = df_missing_data[["id", TARGET_COLUMN, "last_updated"]].to_dict(orient="records")
records[-5:]
records.extend(
    df_non_missing_data[["id", TARGET_COLUMN, "last_updated"]].to_dict(orient="records")
)
records[-5:]

2
db = Database(DB)
[next(db["videos"].rows) for _ in range(5)]
assert len(records) == db["videos"].count
db["videos"].upsert_all(records=records, pk="id")
[next(db["videos"].rows) for _ in range(5)]


1

[{'id': 'PN1q_RrO_6c',
  'comment_count_estimated': 557,
  'last_updated': Timestamp('2023-12-06 09:08:28.887195')},
 {'id': 'FW8Eg1OR9Nc',
  'comment_count_estimated': 557,
  'last_updated': Timestamp('2023-12-06 09:08:28.887195')},
 {'id': 'MRjMbSTZuB0',
  'comment_count_estimated': 557,
  'last_updated': Timestamp('2023-12-06 09:08:28.887195')},
 {'id': '6S6j4MOsCqI',
  'comment_count_estimated': 556,
  'last_updated': Timestamp('2023-12-06 09:08:28.887195')},
 {'id': 'xDlOsVY1R2A',
  'comment_count_estimated': 557,
  'last_updated': Timestamp('2023-12-06 09:08:28.887195')}]

[{'id': 'VRL1hGPOUTo',
  'comment_count_estimated': 835,
  'last_updated': Timestamp('2023-12-06 09:08:28.887397')},
 {'id': 'PwILkY9gRrc',
  'comment_count_estimated': 3700,
  'last_updated': Timestamp('2023-12-06 09:08:28.887397')},
 {'id': 'Hgqp7l9s_9o',
  'comment_count_estimated': 158,
  'last_updated': Timestamp('2023-12-06 09:08:28.887397')},
 {'id': 'EiJWXjz1uks',
  'comment_count_estimated': 396,
  'last_updated': Timestamp('2023-12-06 09:08:28.887397')},
 {'id': '7Zu4ON8D0fk',
  'comment_count_estimated': 863,
  'last_updated': Timestamp('2023-12-06 09:08:28.887397')}]

2

[{'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 95,
  'last_updated': '2023-12-06T09:07:45.175066'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 95,
  'last_updated': '2023-12-06T09:07:45.175066'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 95,
  'last_updated': '2023-12-06T09:07:45.175066'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 95,
  'last_updated': '2023-12-06T09:07:45.175066'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date':

<Table videos (id, ranking, view_count, comment_count, like_count, upload_date, comment_count_estimated, last_updated)>

[{'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 95,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 95,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 95,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 95,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date':

3

<sqlite3.Cursor at 0x29c493a40>

[{'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 0,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 0,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 0,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 0,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20

## 009.005 Clean up

1. Reload DB
    1. Set comment_count_estimated back to 0
1. Print the first 5 records

In [34]:
from sqlite_utils import Database

# solution

1
db = Database(DB)
db.execute(f"update videos set comment_count_estimated=0")

2
[next(db["videos"].rows) for _ in range(5)]

1

<sqlite3.Cursor at 0x29c6b7dc0>

2

[{'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 0,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 0,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 0,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20230913',
  'comment_count_estimated': 0,
  'last_updated': '2023-12-06T09:08:28.887397'},
 {'id': 'Tis5Tm7PAwM',
  'ranking': 1743,
  'view_count': 7831,
  'comment_count': 95,
  'like_count': 800,
  'upload_date': '20