In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [2]:
# Load the three raw inputs in one pass: article metadata, page views, visits.
raw_df, pvs, visits = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/content.csv", '../data/pvs.csv', '../data/visits.csv')
)

# Basic data cleaning and processing
### Clean the visit records and page view records

In [221]:
# There are 346609 page views
# Preview the first rows of the page-view table to check its columns.
pvs.head() # 346609 rows × 3 columns

Unnamed: 0,visitID,visitorID,contentID
0,1531854699,201104700000000.0,www.arkansasonline.com/news/2018/jul/17/author...
1,1532645198,201104700000000.0,www.arkansasonline.com/news/2011/nov/15/walton...
2,1533148392,201104700000000.0,www.arkansasonline.com/news/2018/aug/01/blaze-...
3,1533827139,201104700000000.0,www.arkansasonline.com/news/2018/aug/08/sherif...
4,1533827139,201104700000000.0,www.arkansasonline.com/news/2018/aug/09/top-6-...


In [222]:
# Show dtypes and non-null counts per column to spot missing values.
pvs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346609 entries, 0 to 346608
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   visitID    346609 non-null  int64  
 1   visitorID  346609 non-null  float64
 2   contentID  293731 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 7.9+ MB


A lot of page views are missing `contentID`.

In [3]:
# There are 256121 visits (256121 rows × 10 columns in the raw file).
# Keep only the join keys ('visitID', 'visitorID') and the visit timestamp.
# The former mid-cell `visits.head()` was removed: it was not the last
# expression of the cell, so its result was computed and silently discarded.
visits = visits[['visitID', 'visitorID', 'visitDateTime']]

In [224]:
# Report the earliest and latest visit timestamps in the visit records.
visit_times = visits['visitDateTime']
print(f"The earlist records: {min(visit_times)}")
print(f"The latest records: {max(visit_times)}")

The earlist records: 2018-07-01T05:02:22Z
The latest records: 2018-10-01T04:45:18Z


In [225]:
# Show dtypes and non-null counts for the trimmed visit table.
visits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256121 entries, 0 to 256120
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   visitID        256121 non-null  int64  
 1   visitorID      256121 non-null  float64
 2   visitDateTime  256121 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 5.9+ MB


There are no missing values in visit records.

Each visit may have more than one page view.
Then, left join the visit records on the page view records and drop incomplete rows to get the complete visit records with visit time. 

In [4]:
# Attach the visit timestamp to every page view with a left join on the
# (visitorID, visitID) keys, then discard rows that contain any missing value
# and renumber the index.
user_visits = (
    pvs
    .merge(visits, on=['visitorID', 'visitID'], how='left')
    .dropna()
    .reset_index(drop=True)
)
user_visits

Unnamed: 0,visitID,visitorID,contentID,visitDateTime
0,1531854699,2.011047e+14,www.arkansasonline.com/news/2018/jul/17/author...,2018-07-17T19:11:39Z
1,1532645198,2.011047e+14,www.arkansasonline.com/news/2011/nov/15/walton...,2018-07-26T22:46:38Z
2,1533148392,2.011047e+14,www.arkansasonline.com/news/2018/aug/01/blaze-...,2018-08-01T18:33:12Z
3,1533827139,2.011047e+14,www.arkansasonline.com/news/2018/aug/08/sherif...,2018-08-09T15:05:39Z
4,1533827139,2.011047e+14,www.arkansasonline.com/news/2018/aug/09/top-6-...,2018-08-09T15:05:39Z
...,...,...,...,...
293032,1537018622,1.805825e+19,www.arkansasonline.com/news/2018/sep/15/key-pi...,2018-09-15T13:37:02Z
293033,1537285726,1.805825e+19,www.arkansasonline.com/news/2018/sep/18/distri...,2018-09-18T15:48:46Z
293034,1537298388,1.805825e+19,www.arkansasonline.com/news/2018/sep/18/little...,2018-09-18T19:19:48Z
293035,1537318704,1.805825e+19,www.arkansasonline.com/news/2018/sep/18/nation...,2018-09-19T00:58:24Z


### Basic cleaning on the article features

In [5]:
# Keep a single row per article: the first occurrence of each contentID wins.
content_df = raw_df.drop_duplicates(subset=['contentID'])  # 12491 rows × 7 columns
content_df.head()

Unnamed: 0,contentID,URL,headline,author,cmsCategories,nl_category_structured,nl_category
0,www.arkansasonline.com/news/2018/aug/23/sherif...,https://www.arkansasonline.com/news/2018/aug/2...,14-year-old dies days after being accidentally...,Jaime Dunaway,"/News, /News/Arkansas, /News/Arkansas/Southeast",,
1,www.arkansasonline.com/news/2018/aug/25/woods-...,https://www.arkansasonline.com/news/2018/aug/2...,Woods impresses despite stumbles,Tom Murphy,"/Sports, /Sports/College/Football, /Sports/Col...",,
2,www.arkansasonline.com/news/2018/aug/25/newton...,https://www.arkansasonline.com/news/2018/aug/2...,Newton has injury scare; Panthers win,The Associated Press,"/Sports, /Sports/Pros/Football",,
3,www.arkansasonline.com/news/2018/aug/24/arkans...,https://www.arkansasonline.com/news/2018/aug/2...,North Little Rock man accidentally shoots moth...,Jaime Dunaway,"None/Nlr, /News, /News/Arkansas, /News/Arkansa...",,
4,www.arkansasonline.com/news/2018/aug/25/wolver...,https://www.arkansasonline.com/news/2018/aug/2...,Wolverines depending on ex-Ole Miss QB Patterson,The Associated Press,"/Sports, /Sports/College/Football",,


There are 12491 articles from browsing records.

In [228]:
author_col = content_df['author']
# There are 1436 authors.
# NOTE(review): this result is not the last expression of the cell, so it is
# computed but never displayed.
author_col.value_counts()
# There are 1519 articles with missing author records.
author_col.isna().sum()

1519

In [229]:
# Show dtypes and non-null counts per article column.
content_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12491 entries, 0 to 16546
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   contentID               12491 non-null  object
 1   URL                     12491 non-null  object
 2   headline                12491 non-null  object
 3   author                  10972 non-null  object
 4   cmsCategories           12485 non-null  object
 5   nl_category_structured  2200 non-null   object
 6   nl_category             2200 non-null   object
dtypes: object(7)
memory usage: 780.7+ KB


There are a lot of null values in the `nl_category_structured` and `nl_category` columns, so I will merge them with `cmsCategories` into a single `categories` column and then drop the original columns.

In [6]:
# Build the article profile from the headline and the category information:
#   1. replace every null with a single space so string concatenation works,
#   2. concatenate the three category columns into one `categories` column,
#   3. drop the source category columns plus URL and author,
#   4. renumber the index.
# NOTE(review): the three columns are joined with no separator, so the end of
# one value runs directly into the start of the next — confirm this is intended.
# NOTE(review): this cell reads and overwrites `content_df`; re-running it
# without re-running the previous cell fails because the source columns are gone.
category_cols = ['cmsCategories', 'nl_category_structured', 'nl_category']
filled = content_df.fillna(' ')
content_df = (
    filled
    .assign(
        categories=filled[category_cols[0]]
        + filled[category_cols[1]]
        + filled[category_cols[2]]
    )
    .drop(columns=category_cols + ['URL', 'author'])
    .reset_index(drop=True)
)

In [231]:
# Display the cleaned article table (contentID, headline, categories).
content_df

Unnamed: 0,contentID,headline,categories
0,www.arkansasonline.com/news/2018/aug/23/sherif...,14-year-old dies days after being accidentally...,"/News, /News/Arkansas, /News/Arkansas/Southeast"
1,www.arkansasonline.com/news/2018/aug/25/woods-...,Woods impresses despite stumbles,"/Sports, /Sports/College/Football, /Sports/Col..."
2,www.arkansasonline.com/news/2018/aug/25/newton...,Newton has injury scare; Panthers win,"/Sports, /Sports/Pros/Football"
3,www.arkansasonline.com/news/2018/aug/24/arkans...,North Little Rock man accidentally shoots moth...,"None/Nlr, /News, /News/Arkansas, /News/Arkansa..."
4,www.arkansasonline.com/news/2018/aug/25/wolver...,Wolverines depending on ex-Ole Miss QB Patterson,"/Sports, /Sports/College/Football"
...,...,...,...
12486,www.arkansasonline.com/news/2018/jun/29/five-m...,Five more movie classics finally screened,"/Entertainment/Movies, /Arts & Entertainment/M..."
12487,www.arkansasonline.com/news/2018/jul/29/kavana...,Kavanaugh rulings a hit for NRA,"/News/National, /News/Politics/National/Law & ..."
12488,www.arkansasonline.com/news/2018/sep/09/he-s-g...,He's gonna get in trouble with this,"/Editorial, /Editorial/Columns/News/Politics, ..."
12489,www.arkansasonline.com/news/2017/oct/29/blind-...,RIGHT TIME RIGHT PLACE: RIGHT TIME RIGHT PLACE...,/Entertainment/Features/Highprofile


In [7]:
# Persist the cleaned article table without the index column.
content_df.to_csv('../data/cleaned/content.csv', index = False)

### Completed records
Inner join the article content records with visit records

In [8]:
# Inner-join article features onto the timestamped page views, order the rows
# chronologically by the timestamp column, and renumber the index.
total_info = (
    content_df
    .merge(user_visits, on='contentID', how='inner')
    .sort_values('visitDateTime')
    .reset_index(drop=True)
)
# Reduce each timestamp to its calendar date (stored as datetime.date objects).
total_info['visitDateTime'] = pd.to_datetime(total_info['visitDateTime']).dt.date
total_info.head()

Unnamed: 0,contentID,headline,categories,visitID,visitorID,visitDateTime
0,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
1,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
2,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
3,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
4,www.arkansasonline.com/news/2018/jun/30/suspec...,Suspect arrested in fatal shooting in Pulaski ...,/News/Arkansas/Crime,1530422162,9.015515e+18,2018-07-01


In [15]:
# Display the joined article/visit records.
total_info

Unnamed: 0,contentID,headline,categories,visitID,visitorID,visitDateTime
0,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
1,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
2,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
3,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
4,www.arkansasonline.com/news/2018/jun/30/suspec...,Suspect arrested in fatal shooting in Pulaski ...,/News/Arkansas/Crime,1530422162,9.015515e+18,2018-07-01
...,...,...,...,...,...,...
292779,www.arkansasonline.com/news/2018/sep/30/buffal...,Buffalo ribs with Cam,"/Editorial, /Editorial/Columns, /Editorial/Col...",1538367030,3.903908e+18,2018-10-01
292780,www.arkansasonline.com/news/2018/sep/30/foster...,"In Arkansas, foster youths who age out often a...",/News/Arkansas/People & Society/People & Society,1538367692,6.027405e+16,2018-10-01
292781,www.arkansasonline.com/news/2018/sep/30/foster...,"In Arkansas, foster youths who age out often a...",/News/Arkansas/People & Society/People & Society,1538367692,6.027405e+16,2018-10-01
292782,www.arkansasonline.com/news/2018/sep/30/foster...,"In Arkansas, foster youths who age out often a...",/News/Arkansas/People & Society/People & Society,1538367692,6.027405e+16,2018-10-01


In [16]:
# Show dtypes and non-null counts for the joined records.
total_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292784 entries, 0 to 292783
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   contentID      292784 non-null  object 
 1   headline       292784 non-null  object 
 2   categories     292784 non-null  object 
 3   visitID        292784 non-null  int64  
 4   visitorID      292784 non-null  float64
 5   visitDateTime  292784 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 13.4+ MB


In [17]:
# Count distinct values per column (articles, visits, visitors, days, ...).
total_info.nunique()

contentID         11455
headline          10280
categories         2221
visitID          106555
visitorID          3421
visitDateTime        93
dtype: int64

In [18]:
# Report the first and last calendar dates present in the joined records.
record_dates = total_info['visitDateTime']
print(f"The earlist records: {min(record_dates)}")
print(f"The latest records: {max(record_dates)}")

The earlist records: 2018-07-01
The latest records: 2018-10-01


In [9]:
# Persist the joined records exactly as they are (repeated rows included).
total_info.to_csv("../data/cleaned/total_info.csv", index=False)
# Preview one row per (visitID, visitorID, visitDateTime) combination.
# NOTE(review): the de-duplicated frame is only displayed, not assigned, so
# `total_info` and the saved CSV are unaffected — confirm that is intended.
total_info.drop_duplicates(subset=['visitID', 'visitorID', 'visitDateTime'])

Unnamed: 0,contentID,headline,categories,visitID,visitorID,visitDateTime
0,www.arkansasonline.com/news/2018/jun/30/arrest...,Arrested state legislator urged to quit post,"/News/Arkansas, /News/Politics/Arkansas/Law & ...",1530421855,5.364833e+18,2018-07-01
4,www.arkansasonline.com/news/2018/jun/30/suspec...,Suspect arrested in fatal shooting in Pulaski ...,/News/Arkansas/Crime,1530422162,9.015515e+18,2018-07-01
5,www.arkansasonline.com/news/2018/jun/30/police...,Police: Officer fatally shoots self at central...,/News/Arkansas/Law & Government/Public Safety/...,1530423657,1.343160e+18,2018-07-01
9,www.arkansasonline.com/news/2018/may/03/1-kill...,Driver dies after losing control of Corvette i...,"/News/Arkansas, /News/Fatalwrecks, /Social",1530423935,4.705169e+18,2018-07-01
13,www.arkansasonline.com/news/2018/apr/24/facebo...,Facebook profile lands Arkansas sex offender i...,"/News/Arkansas, /News/Arkansas/Crime/Law & Gov...",1530424161,4.705169e+18,2018-07-01
...,...,...,...,...,...,...
292773,www.arkansasonline.com/news/2018/sep/30/deadli...,Deadline looms; health coverage at risk for many,"/News/Arkansas, /News/Healthcare",1538366477,9.595882e+15,2018-10-01
292774,www.arkansasonline.com/news/2018/sep/29/texas-...,Texas A&M;'s six-year reign,"/Sports, /Sports/College/Razorbacks/Razobacks-...",1538366670,5.290775e+17,2018-10-01
292778,www.arkansasonline.com/news/2018/sep/30/means-...,Means to an end,"/Editorial, /Editorial/Columns, /Editorial/Col...",1538367023,3.903908e+18,2018-10-01
292779,www.arkansasonline.com/news/2018/sep/30/buffal...,Buffalo ribs with Cam,"/Editorial, /Editorial/Columns, /Editorial/Col...",1538367030,3.903908e+18,2018-10-01


### Split the records into training set, validation set and test set

The records were collected over 3 months (93 days), so I will split them chronologically into a training set, validation set and test set at a ratio of roughly 70:15:15 (65:14:14 days).

In [10]:
# Chronological split: train up to and including 2018-09-03, validation up to
# and including 2018-09-17, test afterwards.
#
# total_info['visitDateTime'] holds datetime.date objects (it was produced with
# `.dt.date`), so the cut-offs are converted to datetime.date as well.
# Comparing date objects against pd.Timestamp was deprecated in pandas 1.x and
# raises TypeError for inequality comparisons from pandas 2.0 onwards.
train_end = pd.to_datetime('2018-09-03').date()
valid_end = pd.to_datetime('2018-09-17').date()
visit_dates = total_info['visitDateTime']

train_df = total_info.loc[visit_dates <= train_end]
valid_df = total_info.loc[(visit_dates > train_end) & (visit_dates <= valid_end)]
test_df = total_info.loc[visit_dates > valid_end]

# The three windows must partition the records: nothing lost, nothing doubled.
assert len(train_df) + len(valid_df) + len(test_df) == len(total_info)

train_df.to_csv('../data/cleaned/train_df.csv', index=False)
valid_df.to_csv('../data/cleaned/valid_df.csv', index=False)
test_df.to_csv('../data/cleaned/test_df.csv', index=False)

In [11]:
def describe_split(label, frame):
    """Print the date range and the day/article/visitor/record counts of one split.

    Parameters
    ----------
    label : str
        Heading printed before the statistics (e.g. "train set").
    frame : pandas.DataFrame
        Split to summarize; must contain the 'visitDateTime', 'contentID'
        and 'visitorID' columns.
    """
    split_dates = frame['visitDateTime']
    print(label)
    print(f"{min(split_dates)} - {max(split_dates)}")
    print(f"{split_dates.nunique()} days")
    print(f"{frame['contentID'].nunique()} articles")
    print(f"{frame['visitorID'].nunique()} visitors")
    print(f"Total records: {len(frame)}")


split_frames = [("train set", train_df), ("valid set", valid_df), ("test set", test_df)]
for position, (label, frame) in enumerate(split_frames):
    # A divider goes between consecutive summaries, not before the first one.
    if position > 0:
        print("------------------")
    describe_split(label, frame)

train set
2018-07-01 - 2018-09-03
65 days
8529 articles
2884 visitors
Total records: 199936
------------------
valid set
2018-09-04 - 2018-09-17
14 days
1853 articles
1868 visitors
Total records: 52824
------------------
test set
2018-09-18 - 2018-10-01
14 days
1773 articles
1625 visitors
Total records: 40024
