# Assessment Title: Cycling 

Analyzing Ireland's Cycling Data and Comparing It with Mexico's and New York's

In [None]:
pip install matplotlib

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Read the cover image from the working directory and draw it with the axes hidden.
img = mpimg.imread('bike.jpg')
imgplot = plt.imshow(img)  # image handle is kept in `imgplot`
plt.axis('off')
plt.show()

# DUBLIN 

Import of raw data and data cleaning 

In [None]:
# Load the two raw Dublin CSV files (paths are relative to the notebook's working directory).
dublin1, dublin2 = (pd.read_csv(csv_path) for csv_path in ('dublin1.csv', 'dublin2.csv'))

In [None]:
dublin1.head(1)

In [None]:
dublin2.head(1)

In [None]:
# Column index of each file
headers_dublin1 = dublin1.columns
headers_dublin2 = dublin2.columns

# Both the names and their order must match for the headers to count as identical
print(
    "The datasets have the same headers."
    if headers_dublin1.tolist() == headers_dublin2.tolist()
    else "The datasets have different headers."
)

Since both files have the same headers, they are concatenated into a single dataset.

In [None]:
# Stack the two files into one frame. Each file is read with its own 0-based row labels,
# so ignore_index=True is needed to give the combined frame a unique, continuous index
# (duplicate labels make later label-based selection and alignment ambiguous).
dublin = pd.concat([dublin1, dublin2], ignore_index=True)
dublin.head(3)

### Exploratory Data Analysis (EDA)

In [None]:
dublin.shape

In [None]:
dublin.info()

In [None]:
dublin.isnull().sum()

In [None]:
dublin['system_id'].value_counts()

In [None]:
dublin['is_installed'].value_counts()

In [None]:
dublin['is_renting'].value_counts()

In [None]:
dublin['is_returning'].value_counts()

In [None]:
# Remove six columns from the combined frame; drop() returns a new frame that is rebound to `dublin`.
dublin = dublin.drop(
    columns=['short_name', 'region_id', 'system_id', 'is_installed', 'is_renting', 'is_returning']
)

In [None]:
dublin.describe()

#### Date and time format change 

In [None]:
dublin.head(1)

In [None]:
dublin['last_reported'] = dublin['last_reported'].astype(str)

In [None]:
# Split the 'last_reported' text at spaces into a 'date' part and a 'time' part.
# NOTE(review): assumes every value contains exactly one space — TODO confirm against the raw data.
dublin[['date', 'time']] = dublin['last_reported'].str.split(' ', expand=True)

In [None]:
# Parse the date text into pandas datetime values.
dublin['date'] = pd.to_datetime(dublin['date'])

# Parse the time text, then keep only the time-of-day component (datetime.time objects).
dublin['time'] = pd.to_datetime(dublin['time']).dt.time

In [None]:
dublin.dtypes

In [None]:
dublin= dublin.drop('last_reported', axis=1)

In [None]:
dublin.head(1)

In [None]:
dublin= dublin.drop('address', axis=1)

## Bicycles in use 

In [None]:
# Derive bikes in use as station capacity minus the bikes currently available.
# NOTE(review): this treats every dock without an available bike as a bike in use;
# presumably out-of-service bikes/docks would also be counted — verify against the data dictionary.
dublin['bikes_in_use'] = dublin['capacity'] - dublin['num_bikes_available']
dublin.head(1)

## Analysis of bicycle use in Dublin

### Bikes available

In [None]:
print("Descriptive statistics for number of bikes available:")
print(dublin['num_bikes_available'].describe())

In [None]:
sns.boxplot(x=dublin['num_bikes_available'], color='skyblue')
plt.title('Number of Bikes Available')
plt.xlabel('Number of Bikes Available')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

The mean number of bicycles available is 11 per station.

### Stations

In [None]:
stations = dublin['station_id'].nunique()
print(f"Total number of stations: {stations}")

In [None]:
# Rank stations by their mean number of available bikes, highest first
top_stations = (
    dublin.groupby('name')['num_bikes_available']
    .mean()
    .sort_values(ascending=False)
)
print("Stations with highest bike availability:")
print(top_stations.head(10))

In [None]:
# Stations with lowest bike availability
print("Stations with lowest bike availability:")
print(top_stations.tail(10))

In [None]:
# Bar chart of the ten stations with the highest average availability
top_10_stations = top_stations.head(10)

plt.figure(figsize=(9, 6))
top_10_stations.plot.bar(color='pink')

plt.title('Top 10 Stations by Average Number of Bikes Available', fontsize=13)
plt.xlabel('Station')
plt.ylabel('Average Number of Bikes Available')
plt.xticks(rotation=20, fontsize=6)  # small, tilted tick labels for the station names

plt.tight_layout()
plt.show()

## Location of stations with dataset information

### Scatter plot

In [None]:
import plotly.express as px
import plotly.io as pio

pio.templates.default = "plotly"

In [None]:
# Scatter of station coordinates; marker size and colour both encode capacity.
fig = px.scatter(
    dublin,
    x='lon', y='lat',
    size='capacity', color='capacity',
    size_max=60,
    hover_name='name',
    # Extra fields shown in the hover tooltip (True = include)
    hover_data=dict.fromkeys(
        ['station_id', 'num_bikes_available', 'num_docks_available', 'date', 'time'],
        True,
    ),
    labels={'capacity': 'Bike Station Capacity'},
    title='Bike Station Capacity',
)

# Axis titles, font sizes and colour-bar title
fig.update_layout(
    title_font_size=14,
    xaxis_title='Longitude', xaxis_title_font_size=12,
    yaxis_title='Latitude', yaxis_title_font_size=12,
    coloraxis_colorbar_title='Capacity',
)

fig.show()

## Location of stations using an API 

In [None]:
!pip install requests

In [None]:
import requests
import json
from dotenv import load_dotenv
from os import getenv
import os

In [None]:
# Load variables from a local .env file before reading them. Without this call os.getenv
# only sees variables already exported in the process environment and returns None otherwise.
load_dotenv()

api_key = os.getenv('API_KEY')
contract_name = os.getenv('CONTRACT_NAME')

In [None]:
# Defining the API URL
url = f'https://api.jcdecaux.com/vls/v1/stations?contract={contract_name}&apiKey={api_key}'

In [None]:
# Bound the wait: requests has no default timeout, so an unreachable server would
# otherwise block the notebook indefinitely.
response = requests.get(url, timeout=10)

JCDecaux operates under separate contracts in different locations around the world, so the list of contracts is checked first to find the contract name needed to request data for the right location.

In [None]:
# API key: read from the environment/.env instead of being written into the notebook.
# NOTE: the key that was previously hard-coded here has been exposed and should be rotated.
load_dotenv()
api_key = os.getenv('API_KEY')

# Contracts endpoint; the key is sent as a query parameter
contracts_url = 'https://api.jcdecaux.com/vls/v1/contracts'

contracts_response = requests.get(contracts_url, params={'apiKey': api_key}, timeout=10)

# Checking if the request was successful
if contracts_response.status_code == 200:
    contracts_data = contracts_response.json()
    # Print the name of every contract returned
    for contract in contracts_data:
        print(contract['name'])
else:
    print(f'Error: {contracts_response.status_code}')

In [None]:
# API key comes from the environment/.env (never hard-code it); the contract is fixed to Dublin.
load_dotenv()
api_key = os.getenv('API_KEY')
contract_name = 'Dublin'

url = 'https://api.jcdecaux.com/vls/v1/stations'

# Query parameters sent with the request
params = {
    'contract': contract_name,
    'apiKey': api_key
}

response = requests.get(url, params=params, timeout=10)

# Decode the body only after confirming success: an error response is not guaranteed to be
# JSON, so calling .json() first could raise before the status was ever reported.
if response.status_code == 200:
    data = response.json()
    print('The request was successful.')
else:
    data = []  # keep `data` defined so the following cells still run (on an empty table)
    print('The request was not successful.')

In [None]:
# Build a DataFrame from the decoded API response and preview two rows.
# NOTE(review): this rebinds `dublin2`, which earlier held the contents of 'dublin2.csv';
# re-running the earlier header-comparison/concat cells after this point would use the API table.
dublin2 = pd.DataFrame(data)
dublin2.head(2)

In [None]:
pip install folium

In [None]:
import folium

def create_station_map(dublin2):
    """Show a folium map with one marker per station.

    Parameters
    ----------
    dublin2 : pandas.DataFrame
        One row per station. Each row must provide a ``'position'`` mapping with
        ``'lat'`` and ``'lng'`` entries and a ``'name'`` value (used as the popup text).

    Returns
    -------
    None
        The map is rendered through the notebook's ``display`` function.
    """
    # Fixed starting view: these coordinates and zoom level are hard-coded
    dublin_map = folium.Map(location=[53.3498, -6.2603], zoom_start=13)

    for _, station in dublin2.iterrows():
        coords = station['position']
        marker = folium.Marker([coords['lat'], coords['lng']], popup=station['name'])
        marker.add_to(dublin_map)

    display(dublin_map)

create_station_map(dublin2)

# NEW YORK

Import of raw data and data cleaning 

In [None]:
ny = pd.read_csv('NY1.csv')

In [None]:
ny.head(3)

In [None]:
ny.info()

In [None]:
ny.isnull().sum()

In [None]:
ny= ny.drop(['ride_id','start_station_id', 'end_station_id'], axis=1)

#### Null field

start_station_name 

In [None]:
empty_name = ny[ny['start_station_name'].isnull()]
empty_name

In [None]:
lat = ny[ny['start_lat'] == 40.71]
lat

In [None]:
ny['start_station_name'] = ny['start_station_name'].fillna('other')

#### Date and time format change

In [None]:
# Parse both ride timestamps using the explicit day/month/year hour:minute layout
for ts_col in ('started_at', 'ended_at'):
    ny[ts_col] = pd.to_datetime(ny[ts_col], format='%d/%m/%Y %H:%M')

In [None]:
# Extracting date and time
ny['date_started'] = ny['started_at'].dt.date
ny['date_end'] = ny['ended_at'].dt.date
ny['time_started'] = ny['started_at'].dt.time
ny['time_end'] = ny['ended_at'].dt.time

In [None]:
ny.head(2)

In [None]:
# Ride length in minutes (unrounded).
# NOTE(review): the next code cell recomputes this same column and rounds it to 2 decimals,
# so this assignment is overwritten there.
ny['duration_minutes'] = (ny['ended_at'] - ny['started_at']).dt.total_seconds() / 60

#### Duration of each ride

In [None]:
# Ride length in minutes, rounded to two decimals. The timedelta is used directly,
# so no temporary 'duration' column has to be created and dropped.
ny['duration_minutes'] = (
    (ny['ended_at'] - ny['started_at'])
    .dt.total_seconds()
    .div(60)
    .round(2)
)

In [None]:
ny= ny.drop(['started_at','ended_at'], axis=1)

In [None]:
ny.head(1)

In [None]:
ny.isnull().sum()

In [None]:
# NY = Individual analysis of New York
# ny = Analysis comparing New York with other countries

# .copy() gives NY its own data. A plain `NY = ny` only creates a second name for the same
# frame, so any in-place change made through one name would also show up under the other.
NY = ny.copy()

In [None]:
# Keep only rides with a known end point. .copy() detaches the filtered result so that
# adding columns to NY later does not trigger SettingWithCopyWarning.
NY = NY.dropna(subset=['end_lat', 'end_lng', 'end_station_name']).copy()
NY.isnull().sum()

In [None]:
# Remove the coordinate, bike-type, membership and end-station columns from the comparison frame
ny = ny.drop(
    columns=['start_lat', 'start_lng', 'rideable_type', 'end_lat', 'end_lng',
             'member_casual', 'end_station_name']
)
ny.isnull().sum()

In [None]:
ny['bikes_in_use'] = 1

In [None]:
ny.dtypes

In [None]:
ny.describe()

In [None]:
ny.shape

# Analysis of bicycle use in New York

#### Type of bicycle

In [None]:
print("Bicycle Type available:")
print(NY['rideable_type'].value_counts())

In [None]:
rideable_type_counts = NY['rideable_type'].value_counts()
rideable_type_counts.plot(kind='bar', color='darkseagreen')
plt.title('Bicycle Type', fontsize=14)
plt.xlabel('Type' , fontsize=13)
plt.ylabel('Bicycles', fontsize=13)
plt.xticks(rotation=360)
plt.show()

#### Membership Type

In [None]:
print("Rides by Membership Type:")
print(NY['member_casual'].value_counts())

In [None]:
membership_counts = NY['member_casual'].value_counts()

plt.figure(figsize=(5, 6))
plt.pie(membership_counts, labels=membership_counts.index, autopct='%1.1f%%', colors=['khaki', 'darkseagreen'], wedgeprops=dict(width=0.3))
plt.title('Rides by Membership Type')
plt.show()

### Duration of Rides

In [None]:
print("Duration of Rides:")
print(NY['duration_minutes'].describe())

##### Long Trips

In [None]:
# Rides of 20 minutes or more. The comparison is inclusive so that, together with the
# "< 20" short-trip filter, every ride with a known duration falls into exactly one group;
# this also matches the 'Trips >= 20 min' label used in the pie chart below.
long_trips = ny[ny['duration_minutes'] >= 20]

num_long_trips = long_trips.shape[0]

print(f"Number of rides of 20 minutes or longer: {num_long_trips}")

In [None]:
# Share of long rides among all rides. Stored under its own name so the `long_trips`
# DataFrame from the previous cell is not replaced by a float.
long_trips_pct = (num_long_trips / ny.shape[0]) * 100
print(f'Percentage of trips more than 20 minutes: {long_trips_pct:.2f}%')

##### Short Trips

In [None]:
short_trips = ny[ny['duration_minutes'] < 20]

num_short_trips = short_trips.shape[0]

print(f'Number of trips less than 20 minutes: {num_short_trips}')

In [None]:
# Share of short rides among all rides. Stored under its own name so the `short_trips`
# DataFrame from the previous cell is not replaced by a float.
short_trips_pct = (num_short_trips / ny.shape[0]) * 100
print(f'Percentage of trips less than 20 minutes: {short_trips_pct:.2f}%')

In [None]:
labels = ['Trips < 20 min', 'Trips >= 20 min']
sizes = [num_short_trips, num_long_trips]
colors = ['darkseagreen', 'khaki']
explode = (0.1, 0) 

plt.figure(figsize=(6, 6))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
        shadow=True, startangle=140)
plt.title('Proportion of Trip Durations')
plt.show()

In [None]:
# Distribution of ride durations up to 60 minutes, in one-minute bins.
# range(0, 61) yields the bin edges 0..60; the previous range(0, 60) stopped at 59 and
# silently left the 59-60 minute rides out of a plot titled "up to 60 minutes".
plt.figure(figsize=(10, 6))
plt.hist(ny['duration_minutes'], bins=range(0, 61, 1), color='khaki', edgecolor='black')
plt.title('Distribution of Ride Durations, Up to 60 minutes')
plt.xlabel('Minutes')
plt.ylabel('Frequency')
plt.xlim(0, 60)
plt.show()

## Data normalization 

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Two-dimensional (single-column) selection, the input layout scikit-learn scalers expect.
durations = ny[['duration_minutes']]

scaler = MinMaxScaler()

# Fit on the New York durations and store the rescaled values as a new column.
ny['duration_minutes_normalized'] = scaler.fit_transform(durations)

# Compare raw and rescaled values side by side.
ny[['duration_minutes', 'duration_minutes_normalized']].head()

## Data Standardized

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Rebinds `scaler` (previously the MinMaxScaler) to a StandardScaler.
scaler = StandardScaler()

# `durations` is the single-column frame created in the normalization cell above.
ny['duration_minutes_standardized'] = scaler.fit_transform(durations)


# Compare raw and standardized values side by side.
ny[['duration_minutes', 'duration_minutes_standardized']].head()

### Heatmaps for Bike Trip Start and End Locations

In [None]:
from folium.plugins import HeatMap
from IPython.display import IFrame

In [None]:
# Sample of the DataFrame for a more manageable size
# (1% of the rows; random_state fixes which rows are drawn)
sample_df = NY.sample(frac=0.01, random_state=1) 

# Map 
# Centre the map on the mean start coordinate of the sample.
avg_lat = sample_df['start_lat'].mean()
avg_lng = sample_df['start_lng'].mean()
station_map = folium.Map(location=[avg_lat, avg_lng], zoom_start=13)

# Coordinates for the heatmap
# Rows with a missing coordinate are dropped before conversion to [lat, lng] lists.
start_coords = sample_df[['start_lat', 'start_lng']].dropna().values.tolist() 
end_coords = sample_df[['end_lat', 'end_lng']].dropna().values.tolist() 

# Heatmap to the map
# Start and end points are added as two separate layers on the same map.
HeatMap(start_coords, radius=10, max_zoom=13).add_to(station_map)
HeatMap(end_coords, radius=10, max_zoom=13).add_to(station_map)

# Written to disk so the next cell can embed it in an IFrame.
station_map.save('heatmap.html')

In [None]:
IFrame(src='heatmap.html', width=800, height=600)

#### TOP 10 Routes

In [None]:
# Label each ride with its "start to end" station pair. assign() returns a new frame, which
# avoids the SettingWithCopyWarning raised when a column is set directly on the filtered NY frame.
NY = NY.assign(route=NY['start_station_name'] + ' to ' + NY['end_station_name'])

# Ten most frequent routes
route_usage = NY['route'].value_counts().head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x=route_usage.values, y=route_usage.index, palette='viridis')
plt.title('Top 10 Most Common Routes')
plt.xlabel('Number of Trips')
plt.ylabel('Route')
plt.show()

# MEXICO CITY

In [None]:
mexico = pd.read_csv('mexico.csv')
mexico.head()

#### Exploratory Data Analysis (EDA)

In [None]:
mexico.columns.tolist()

In [None]:
# New column names, applied purely by position.
# NOTE(review): assumes 'mexico.csv' has exactly nine columns in this order — TODO confirm
# against the column list printed in the previous cell.
mexico_columns = [
    'ride_gender',
    'ride_age',
    'Bike',
    'start_station_name',
    'date_started',
    'time_started',
    'end_station_name',
    'date_end',
    'time_end'
]

# Replaces every existing column label with the names above.
mexico.columns = mexico_columns

In [None]:
mexico.dtypes

In [None]:
mexico.shape

In [None]:
mexico.info()

In [None]:
mexico.isnull().sum()

In [None]:
# mexico = Analysis comparing Mexico City with other countries 
# mexico1= Individual analysis of Mexico City

# .copy() gives mexico1 its own data. A plain `mexico1 = mexico` only creates a second name
# for the same frame, so in-place changes made through one name would affect the other.
mexico1 = mexico.copy()

In [None]:
# Remove the rider, bike and end-station columns from the comparison frame
mexico = mexico.drop(columns=['ride_age', 'ride_gender', 'Bike', 'end_station_name'])
mexico.isnull().sum()

In [None]:
mexico1= mexico1.dropna(subset=['ride_age'])
mexico1.isnull().sum()

In [None]:
mexico['bikes_in_use'] = 1

## Analysis of bicycle use in Mexico City

#### Gender

In [None]:
gender_counts = mexico1['ride_gender'].value_counts()
print(f"Registered genders:\n{gender_counts}")

In [None]:
plt.figure(figsize=(8, 6))
sns.barplot(x=gender_counts.index, y=gender_counts.values, palette='crest')
plt.title('Distribution of Rides by Gender')
plt.xlabel('Gender')
plt.ylabel('Number of Rides')
plt.show()

#### AGE

In [None]:
print("Cyclists age:")
print(mexico1['ride_age'].describe())

In [None]:
# One bin per year of age. The edges must run from min to max + 1 (hence `+ 2` in range):
# with edges ending at max, the final bin [max - 1, max] is closed on both sides and
# merges the two oldest ages into a single bar.
plt.figure(figsize=(10, 6))
plt.hist(
    mexico1['ride_age'],
    bins=range(int(mexico1['ride_age'].min()), int(mexico1['ride_age'].max()) + 2, 1),
    edgecolor='black',
)
plt.title('Distribution of Cyclists\' Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

#### Rides by Bike ID

In [None]:
top_bikes = mexico1['Bike'].value_counts().head(10)

top_bikes_data = mexico1[mexico1['Bike'].isin(top_bikes.index)]

print("Top 10 most used bikes and their details:")
print(top_bikes_data[['ride_gender', 'ride_age', 'Bike', 'start_station_name']])

In [None]:
# Horizontal bars for the ten most used bikes; IDs are cast to str so they are treated as
# categories rather than numeric positions.
plt.barh(top_bikes.index.astype(str), top_bikes.values, color='skyblue', edgecolor='black')
plt.title('Top 10 Most Used Bikes')
plt.xlabel('Rides')
plt.ylabel('Bike ID')
plt.grid(axis='x', linestyle='--', alpha=0.7)
# Explicit show(), as in every other plotting cell, so rendering does not depend on the
# inline backend flushing the figure at the end of the cell.
plt.show()

In [None]:
sns.boxplot(x='Bike', y='ride_age', data=top_bikes_data, order=top_bikes.index)
plt.title('Age Distribution for Top 10 Most Used Bikes')
plt.xlabel('Bike ID')
plt.ylabel('Age')
plt.xticks(rotation=20)
plt.show()

### Durations of rides

In [None]:
# Join the date and time text with a space and parse the result, reading ambiguous dates day-first.
# NOTE(review): assumes the date columns are day/month/year text — TODO confirm against the raw file.
mexico['start_datetime'] = pd.to_datetime(mexico['date_started'] + ' ' + mexico['time_started'], dayfirst=True)
mexico['end_datetime'] = pd.to_datetime(mexico['date_end'] + ' ' + mexico['time_end'], dayfirst=True)

In [None]:
# Duration in minutes
mexico['duration_minutes'] = (mexico['end_datetime'] - mexico['start_datetime']).dt.total_seconds() / 60.0

In [None]:
# Normalize duration minutes
# Min-max scaling to [0, 1] with MinMaxScaler, the same transform used for the New York
# durations, so the 'duration_minutes_normalized' columns of both frames mean the same thing
# when they are combined. Dividing by the maximum alone only matches when the minimum is 0.
mexico['duration_minutes_normalized'] = (
    MinMaxScaler().fit_transform(mexico[['duration_minutes']]).ravel()
)

In [None]:
# Standardize duration minutes
scaler = StandardScaler()
mexico['duration_minutes_standardized'] = scaler.fit_transform(mexico[['duration_minutes']])

In [None]:
# Remove the helper timestamp columns now that the duration has been derived.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell re-runnable:
# a second run no longer fails with KeyError on the already-removed columns.
mexico = mexico.drop(columns=['start_datetime', 'end_datetime'], errors='ignore')

# Analysis of the 3 countries 

Format display of the 3 dataframes to be compared

In [None]:
dublin['Country'] = 'Dublin'
dublin.head(1)

In [None]:
ny['Country'] = 'New_York'
ny.head(1)

In [None]:
mexico['Country'] = 'Mexico'
mexico.head(1)

#### Data Frame Preparation for Continued Analysis (1st Analysis (3 Countries))

##### Dublin

In [None]:
Dublin = dublin[['station_id', 'date', 'time', 'bikes_in_use', 'Country']]
Dublin.head(1)

#### New York

In [None]:
ny.head(1)

In [None]:
New_York = ny[['start_station_name', 'date_started', 'time_started', 'bikes_in_use', 'Country']]
New_York.head(1)

In [None]:
# Align column names with the Dublin frame. rename() without inplace returns a new frame;
# renaming in place on a column subset of `ny` raises SettingWithCopyWarning.
New_York = New_York.rename(columns={
    'start_station_name': 'station_id',
    'date_started': 'date',
    'time_started': 'time'
})

In [None]:
New_York.head(1)

##### Mexico

In [None]:
Mexico = mexico[['start_station_name', 'date_started', 'time_started', 'bikes_in_use', 'Country']]
Mexico.head(1)

In [None]:
# Align column names with the Dublin frame. rename() without inplace returns a new frame;
# renaming in place on a column subset of `mexico` raises SettingWithCopyWarning.
Mexico = Mexico.rename(columns={
    'start_station_name': 'station_id',
    'date_started': 'date',
    'time_started': 'time'
})

In [None]:
Mexico.head(1)

In [None]:
# Stack the three city frames into one table with a fresh 0..n-1 index.
# NOTE(review): the shared columns hold different kinds of values per city. 'date' is a parsed
# datetime for Dublin, a datetime.date for New York and unparsed text for Mexico; 'station_id'
# is Dublin's station_id column but a station name for the other two; 'bikes_in_use' is a
# per-station figure for Dublin and a constant 1 per ride elsewhere. Confirm this is intended
# before aggregating across cities.
Cycling = pd.concat([Dublin, New_York, Mexico], ignore_index=True)

In [None]:
Cycling.head()

In [None]:
 Cycling.isnull().sum()

In [None]:
Cycling.to_csv('Cycling.csv', index=True)

### Data Frame Preparation for Continued Analysis (2nd Analysis (2 Countries))

In [None]:
mexico.head(1)

In [None]:
ny.head(1)

In [None]:
Cycling2 = pd.concat([ny, mexico], ignore_index=True)

In [None]:
Cycling2.head()

In [None]:
Cycling2.isnull().sum()

In [None]:
Cycling2.to_csv('Cycling2.csv', index=True)

# Sentiment analysis

#### Installation and importing of libraries

In [None]:
pip install tensorflow

In [None]:
!pip install praw
!pip install python-dotenv

In [None]:
import re
import string
from string import punctuation
from nltk.corpus import words
import nltk

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from datetime import datetime as dt

import praw
from dotenv import load_dotenv
from os import getenv
nltk.download('vader_lexicon')

In [None]:
load_dotenv()

Log in to the account with the secret credentials stored in the previously saved .env file.

In [None]:
# Read the Reddit credentials from the environment; each name is None when its variable is not set.
# NOTE(review): USERNAME is a variable some operating systems already define; presumably the
# system value would then be returned instead of the .env entry — verify on the target machine.
APP_NAME = os.getenv('APP_NAME')
APP_ID = os.getenv('APP_ID')
APP_SECRET = os.getenv('APP_SECRET')
USERNAME = os.getenv('USERNAME')
PASSWORD= os.getenv('PASSWORD')

In [None]:
# Report whether every credential was found; a missing variable comes back as None

if any(credential is None for credential in (APP_NAME, APP_ID, APP_SECRET, USERNAME, PASSWORD)):
    print("ERROR: Some credentials not loaded!")
else:
    print("Credentials loaded")

In [None]:
# Create the Reddit client from the application id/secret; the account name is used only
# inside the user-agent string.
# NOTE(review): no username/password is passed here, so the PASSWORD value loaded above is
# not used by this call; presumably the client is read-only (the next cell prints that flag).
reddit = praw.Reddit(client_id=getenv("APP_ID"),
    client_secret=getenv("APP_SECRET"),
    user_agent=f"pda-2023 u/{getenv('USERNAME')}", 
)

In [None]:
print(reddit.read_only)

In [None]:
# Handle to the "Python" subreddit.
# NOTE(review): `subreddit` is not referenced by the later cells in view; the search below
# calls reddit.subreddit('ireland') directly — confirm whether this cell is still needed.
subreddit = reddit.subreddit("Python")

In [None]:
headlines = set()

#### Labeling Data

The r/ireland subreddit will be searched for posts about cycling, and the post titles will be used as the text to analyse.

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [None]:
# Collecting post titles (the set removes duplicate titles)
for submission in reddit.subreddit('ireland').search('cycling'):
    headlines.add(submission.title)
    print(submission.title)

# Applying sentiment analysis 
sia = SIA()
result = []

# Score the titles in sorted order. A set of strings has no stable iteration order between
# kernel sessions, so iterating it directly would change the row order of every table built
# from `result` and, with it, which rows the seeded train/test split selects.
for line in sorted(headlines):
    pol_score = sia.polarity_scores(line)
    pol_score['headlines'] = line  # keep the text next to its scores
    result.append(pol_score)

print(result[:3])

#### Converting the results into a DataFrame

In [None]:
df_ireland = pd.DataFrame.from_records(result)
df_ireland.head()

#### Converting label to a numerical variable

Performing the analysis to see if the words of the comments being analyzed are positive, negative or neutral.

In [None]:
# Start every headline as neutral (0) ...
df_ireland['label']=0
# ... then mark it positive (1) when the compound score is above 0.2 ...
df_ireland.loc[df_ireland['compound']>0.2,'label']=1
# ... or negative (-1) when it is below -0.2; scores in [-0.2, 0.2] stay neutral.
df_ireland.loc[df_ireland['compound']<-0.2,'label']=-1
df_ireland.head()

#### Dataset Info and Statistics

In [None]:
# Keep only the text and its label. .copy() makes this an independent frame, so adding
# columns to it later does not raise SettingWithCopyWarning.
df_ireland2 = df_ireland[['headlines', 'label']].copy()

# Append to the export across runs, writing the header row only when the file is first
# created; appending with a header on every run interleaves header lines with the data.
# The extension is '.csv' (the previous '.cvs' was a typo).
labels_path = 'reddit_headline_labels.csv'
df_ireland2.to_csv(labels_path, mode='a', encoding='utf-8', index=False,
                   header=not os.path.exists(labels_path))

In [None]:
df_ireland3 = df_ireland2.copy()

Number of headlines evaluated, and how many of them are positive, neutral or negative

In [None]:
print(df_ireland3.label.value_counts())
print(df_ireland3.label.value_counts(normalize=True)*100)

### Positive and Negative comments

In [None]:
print("Positive headlines:\n")
print(list(df_ireland2[df_ireland2['label'] == 1]['headlines'])[:5])

print("\nNegative headlines:\n")
print(list(df_ireland2[df_ireland2['label'] == -1]['headlines'])[:5])

#### Comments about cycling in Ireland in percentage 

In [None]:
# Translate the numeric label into a readable name. assign() returns a new frame, which avoids
# the SettingWithCopyWarning raised by setting a column on a column subset of df_ireland.
df_ireland2 = df_ireland2.assign(
    sentiment=df_ireland2['label'].map({1: 'Positive', 0: 'Neutral', -1: 'Negative'})
)

# Count the number of each sentiment
sentiment_counts = df_ireland2['sentiment'].value_counts()

plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=sns.color_palette('viridis', len(sentiment_counts)))
plt.title('Comments about cycling %')
plt.show()

#### Comments about cycling in Ireland in number

In [None]:
# Horizontal bars with the number of headlines per sentiment
sns.barplot(x=sentiment_counts.values, y=sentiment_counts.index, palette='viridis')

# This chart shows absolute counts, so the title no longer carries the "%" copied from the pie chart.
plt.title('Comments about cycling (count)')
plt.xlabel('Frequency')
plt.ylabel('Sentiment')

# Show the plot
plt.show()

### Classification

In [None]:
numpy_array = df_ireland3.values
X = numpy_array[:,0]
Y = numpy_array[:,-1]
Y = Y.astype('int')
print("X")
print(X)
print("Y")
print(Y)

#### Vectorization of words

In [None]:
vec = CountVectorizer( )
vec

In [None]:
vec.fit(X)
vec.vocabulary_

It has converted the documents into a set of unique words alphabetically sorted and indexed.

#### Removing the stop words

In [None]:
vec = CountVectorizer(stop_words='english' )
vec.fit(X)
vec.vocabulary_

#### Another way of representing the features

In [None]:
X_transformed=vec.transform(X)
X_transformed

In [None]:
print(X_transformed)

This representation can be understood as follows:

  (0, 7)	1
  (0, 39)	1
  (0, 44)	1
  (0, 82)	1


Consider the first 4 rows of the output: they say that the first document (index 0) contains the words at vocabulary indices 7, 39, 44, 82 and so on, and that each of them appears only once in the document, as indicated by the right-hand column. The same details are listed for each of the 99 documents.

#### Another interpretable array, converting transformed matrix back to an array.

In [None]:
# The high number of zeros
# Convert the sparse count matrix to a dense array.
# NOTE(review): this rebinds `X` from the array of headline texts to the numeric count matrix;
# re-running the vectorizer cells above after this point would fit on numbers, not text.
X=X_transformed.toarray()
X

In [None]:
# converting matrix to dataframe
# Column labels must follow the column order of the count matrix, which is what
# get_feature_names_out() returns. vec.vocabulary_ is a dict of term -> column index whose key
# order is not guaranteed to match that order, so using it as `columns` mislabels the counts.
dfdublin = pd.DataFrame(X, columns=vec.get_feature_names_out())
dfdublin

This table shows how many times each word occurs in each document. In other words, it is a frequency table of the words.

#### Top 20 most Frequent Words

In [None]:
dfdublin = pd.DataFrame(X, columns=vec.get_feature_names_out())

In [None]:
word_freq = dfdublin.sum().sort_values(ascending=False)

# Frequency 
word_freq_df = pd.DataFrame({'word': word_freq.index, 'frequency': word_freq.values})

#Top 20
top_words_df = word_freq_df.head(20)

plt.figure(figsize=(12, 8))
sns.barplot(x='frequency', y='word', data=top_words_df, palette='viridis')
plt.title('Top 20 Most Frequent Words')
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.show()

### Word frequency

Plot the word cloud for all the key words without preprocessing it

In [None]:
pip install wordcloud

In [None]:
from wordcloud import WordCloud 

In [None]:
wordcloud = WordCloud(random_state=21, max_font_size=119, background_color='white').generate_from_frequencies(word_freq)

plt.figure(figsize=(19, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
dfdublin

### TEST

In [None]:
# Build the "test" arrays: column 0 is the headline text, column 1 the integer label.
# NOTE(review): these come from the same df_ireland3 rows that X and Y were taken from, so
# predictions on X_test below are made on data the model is also fitted on.
test_numpy_array = df_ireland3.values
X_test = test_numpy_array[:,0]
Y_test = test_numpy_array[:,1]
Y_test = Y_test.astype('int')
print("X_test")
print(X_test)
print("Y_test")
print(Y_test)

In [None]:
X_test_transformed=vec.transform(X_test)
X_test_transformed

In [None]:
X_test=X_test_transformed.toarray()
X_test

### Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix

In [None]:
# Bernoulli NB class
bnb = BernoulliNB()

# fitting model on the word-count matrix and integer labels
bnb.fit(X, Y)

# Predicting probability of test data: one row per document, one column per class in
# bnb.classes_ order. Computed once; the earlier extra call only discarded its result.
proba_bnb = bnb.predict_proba(X_test)

In [None]:
# Name the probability columns after the classes the model actually learned, in
# bnb.classes_ order. A fixed list of three names fails with a shape error whenever one of
# the sentiment classes is absent from the collected headlines.
class_labels = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}
proba_df = pd.DataFrame(proba_bnb, columns=[class_labels[c] for c in bnb.classes_])
proba_df

#### Prediction

In [None]:
y_pred_bnb = bnb.predict(X_test)

predictions_df = proba_df.copy()
predictions_df['Predicted Class'] = y_pred_bnb

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
y_test = Y_test

# Refit on the training portion only. The model fitted earlier saw every row of X, including
# the rows now held out, so evaluating it on X_test would score data it was trained on.
bnb = BernoulliNB().fit(X_train, Y_train)

Confusion matrix

In [None]:
# Predicted class for every held-out document
y_pred_bnb = bnb.predict(X_test)

# Fix the class order to negative / neutral / positive. Without `labels`, a class missing from
# both y_test and the predictions is left out, and the matrix no longer lines up with the
# three tick labels used in the heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_bnb, labels=[-1, 0, 1])

print(conf_matrix)

In [None]:
class_names = ['Negative', 'Neutral', 'Positive']
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()