In [1]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.express as px
import json
pd.set_option('display.max_colwidth', None)


Mounted at /content/drive


In [2]:
file_path = '/content/drive/MyDrive/UCB-MIDS/SEM-6/Capstone/Compliance/OPP-115/annotations/'
annotations_df = pd.read_csv(file_path + '1713_latinpost.com.csv')

# Inspect the first few rows of the data
annotations_df.head(1)


Unnamed: 0,21360,test_category_labeling_highlight_upitt,95,3843,0,Other,"{""Other Type"": {""selectedText"": ""Privacy Policy Last Modified: March 25, 2013"", ""startIndexInSegment"": 0, ""endIndexInSegment"": 54, ""value"": ""Introductory/Generic""}}",3/25/13,http://www.latinpost.com/privacypolicy
0,20756,test_category_labeling_highlight_upitt,103,3843,1,Other,"{""Other Type"": {""selectedText"": ""This is the privacy policy (\""Privacy Policy\"") for LatinPost.com (\""Site,\"" \""we,\"" \""us,\"" or \""our\""). This Privacy Policy is intended to explain our privacy practices and covers the following areas: When This Privacy Policy Applies. U.S. Governing Law; Safe Harbor. Collection and Use of Personal Information and Other Information (Other than Wireless Marketing Services and Associated Promotional Opportunities). Opting-out of Certain Uses of your Personal Information and Other Information (Other than Wireless Marketing Services and Associated Promotional Opportunities). Wireless Marketing Services and Associated Promotional Opportunities. Sharing and Disclosure of Personal Information and Other Information. Reviewing, Updating or Deleting Certain Information. Protection of Information. Your California Privacy Rights. Changes to this Privacy Policy and Notice. Miscellaneous."", ""startIndexInSegment"": 0, ""endIndexInSegment"": 1118, ""value"": ""Introductory/Generic""}}",3/25/13,http://www.latinpost.com/privacypolicy


In [3]:
annotations_df.shape

(531, 9)

In [None]:
annotations_df.columns

Index(['21360', 'test_category_labeling_highlight_upitt', '95', '3843', '0',
       'Other',
       '{"Other Type": {"selectedText": "Privacy Policy Last Modified: March 25, 2013", "startIndexInSegment": 0, "endIndexInSegment": 54, "value": "Introductory/Generic"}}',
       '3/25/13', 'http://www.latinpost.com/privacypolicy'],
      dtype='object')

In [None]:
column_headers = ['doc_id', 'test_category', 'segment_number', 'unique_polciy_identifier', 'label', 'policy_category', 'segment_details', 'policy_date', 'url']

In [None]:
annotations_df.columns = column_headers

In [None]:
annotations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 531 entries, 0 to 530
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   doc_id                    531 non-null    int64 
 1   test_category             531 non-null    object
 2   segment_number            531 non-null    int64 
 3   unique_polciy_identifier  531 non-null    int64 
 4   label                     531 non-null    int64 
 5   policy_category           531 non-null    object
 6   segment_details           531 non-null    object
 7   policy_date               531 non-null    object
 8   url                       531 non-null    object
dtypes: int64(4), object(5)
memory usage: 37.5+ KB


In [None]:
annotations_df['policy_date'] = pd.to_datetime(annotations_df['policy_date'], format='%m/%d/%y')

annotations_df['year'] = annotations_df['policy_date'].dt.year
annotations_df[['policy_date', 'year']].head()


Unnamed: 0,policy_date,year
0,2013-03-25,2013
1,2013-03-25,2013
2,2013-03-25,2013
3,2013-03-25,2013
4,2013-03-25,2013


In [None]:
annotations_df.head(1)

Unnamed: 0,doc_id,test_category,segment_number,unique_polciy_identifier,label,policy_category,segment_details,policy_date,url,year
0,20756,test_category_labeling_highlight_upitt,103,3843,1,Other,"{""Other Type"": {""selectedText"": ""This is the privacy policy (\""Privacy Policy\"") for LatinPost.com (\""Site,\"" \""we,\"" \""us,\"" or \""our\""). This Privacy Policy is intended to explain our privacy practices and covers the following areas: When This Privacy Policy Applies. U.S. Governing Law; Safe Harbor. Collection and Use of Personal Information and Other Information (Other than Wireless Marketing Services and Associated Promotional Opportunities). Opting-out of Certain Uses of your Personal Information and Other Information (Other than Wireless Marketing Services and Associated Promotional Opportunities). Wireless Marketing Services and Associated Promotional Opportunities. Sharing and Disclosure of Personal Information and Other Information. Reviewing, Updating or Deleting Certain Information. Protection of Information. Your California Privacy Rights. Changes to this Privacy Policy and Notice. Miscellaneous."", ""startIndexInSegment"": 0, ""endIndexInSegment"": 1118, ""value"": ""Introductory/Generic""}}",2013-03-25,http://www.latinpost.com/privacypolicy,2013


In [None]:
print(annotations_df.shape)

(531, 10)


### 1. Policy category distribution

In [None]:
category_count = annotations_df['policy_category'].value_counts()

fig = px.bar(category_count,
             x=category_count.index,
             y=category_count.values,
             labels={'x': 'Policy Category', 'y': 'Count'},
             title='Distribution of Policy Categories',
             width=1000,
             height=600)

fig.update_layout(title={'text': 'Distribution of Policy Categories', 'x':0.5, 'xanchor': 'center', 'yanchor': 'top'},xaxis_title='Policy Category' )

fig.show()


### 2. Text Length analysis

In [None]:
def extract_selected_text(segment):
    try:
        segment_dict = json.loads(segment)  # Convert string to dictionary
        # Iterate through all possible sub-fields that contain 'selectedText'
        for key, value in segment_dict.items():
            if 'selectedText' in value:
                return value.get("selectedText", "")
        return ""
    except json.JSONDecodeError:
        return ""

# Apply the function to extract 'selectedText' across all possible fields
annotations_df['selectedText'] = annotations_df['segment_details'].apply(extract_selected_text)

### 3. Distribtuion of Text Length

In [None]:
# Create a new column 'text_length' by applying len() on 'selectedText'
annotations_df['text_length'] = annotations_df['selectedText'].apply(len)

# Display the updated DataFrame
annotations_df[['policy_category', 'selectedText', 'text_length']].head(2)


Unnamed: 0,policy_category,selectedText,text_length
0,Other,"This is the privacy policy (""Privacy Policy"") for LatinPost.com (""Site,"" ""we,"" ""us,"" or ""our""). This Privacy Policy is intended to explain our privacy practices and covers the following areas: When This Privacy Policy Applies. U.S. Governing Law; Safe Harbor. Collection and Use of Personal Information and Other Information (Other than Wireless Marketing Services and Associated Promotional Opportunities). Opting-out of Certain Uses of your Personal Information and Other Information (Other than Wireless Marketing Services and Associated Promotional Opportunities). Wireless Marketing Services and Associated Promotional Opportunities. Sharing and Disclosure of Personal Information and Other Information. Reviewing, Updating or Deleting Certain Information. Protection of Information. Your California Privacy Rights. Changes to this Privacy Policy and Notice. Miscellaneous.",878
1,Other,"This is the privacy policy (""Privacy Policy"") for LatinPost.com (""Site,"" ""we,"" ""us,"" or ""our""). This Privacy Policy is intended to explain our privacy practices and covers the following areas: When This Privacy Policy Applies. U.S. Governing Law; Safe Harbor. Collection and Use of Personal Information and Other Information (Other than Wireless Marketing Services and Associated Promotional Opportunities). Opting-out of Certain Uses of your Personal Information and Other Information (Other than Wireless Marketing Services and Associated Promotional Opportunities). Wireless Marketing Services and Associated Promotional Opportunities. Sharing and Disclosure of Personal Information and Other Information. Reviewing, Updating or Deleting Certain Information. Protection of Information. Your California Privacy Rights. Changes to this Privacy Policy and Notice. Miscellaneous.",878


In [None]:
fig = px.histogram(annotations_df, x='text_length', nbins=50, labels={'text_length': 'Text Length'},
                   title='Distribution of Policy Segment Length in Selected Segments')

# Update layout with centered title and custom size
fig.update_layout(title={'x':0.5, 'xanchor': 'center'},
                  xaxis_title='Text Length',
                  yaxis_title='Count',
                  width=800,  # Set width
                  height=500)  # Set height

fig.show()


In [None]:
empty_selected_text_count = (annotations_df['selectedText'].str.strip() == '').sum()
print(f"Number of empty selectedText entries: {empty_selected_text_count}")


Number of empty selectedText entries: 0


In [None]:
fig = px.box(annotations_df, x='policy_category', y='text_length',
             labels={'policy_category': 'Policy Category', 'text_length': 'Text Length'},
             title='Policy Length Distribution Across Policy Categories')
fig.update_layout(title={'x':0.5, 'xanchor': 'center'}, width=1000, height=600)
fig.show()
