# Week 6 Visualizing Data Products (Technology & Airline Sectors) 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from textblob import TextBlob
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

In [None]:
# Location of the tweet export analysed throughout this notebook.
file_path = 'twcs.csv'
# Read through file_path so that changing the path above actually changes
# which file is loaded (the literal used to be repeated here).
data_set = pd.read_csv(file_path)

### Technology Data Analysis

#### Step 1. Define the question
What is the relationship between average response time & customer satisfaction for each technology support service?

#### Step 2. Define the ideal data set:
MicrosoftHelps, AppleSupport, DellCares, HPSupport

#### Step 3. Obtain specific data

In [None]:
# Keep only tweets whose author handle matches one of the four technology
# support accounts (case-insensitive regex alternation).
tech_support_pattern = 'MicrosoftHelps|AppleSupport|DellCares|HPSupport'
is_tech_support = data_set['author_id'].str.contains(tech_support_pattern, case=False)
filtered_data = data_set[is_tech_support]

#### Display filtered data

In [None]:
# Re-select the technology support tweets so this cell can be run on its own.
filtered_data = data_set[data_set['author_id'].str.contains('MicrosoftHelps|AppleSupport|DellCares|HPSupport', case=False)]

# Drive both the header and the cells from the same column list so a header
# label can never end up above the wrong column of values.
table_columns = list(filtered_data.columns)

fig = go.Figure(data=[go.Table(
    header=dict(values=table_columns,
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[filtered_data[column] for column in table_columns],
               fill_color='lavender',
               align='left'))
])

fig.show()


#### Step 4. Clean the data

In [None]:
# Handle missing values: keep only the rows of data_set that have no missing
# value in any column (data_set itself is left untouched).
cleaned_data = data_set.dropna(axis=0, how='any')

In [None]:
# Convert to datetime: parse the 'created_at' column and store the parsed
# values back into data_set.
parsed_created_at = pd.to_datetime(data_set['created_at'])
data_set['created_at'] = parsed_created_at

In [None]:
# Count the tweets for every (author_id, inbound) combination, then turn the
# result into a flat table with the tally in a 'count' column.
tweets_per_group = filtered_data.groupby(['author_id', 'inbound']).size()
grouped_data = tweets_per_group.reset_index(name='count')

#### Explore & Visualize distribution of inbound and outbound tweets

In [None]:
# Bar chart of tweet counts per support account, coloured by the 'inbound'
# flag, with the accounts shown in a fixed left-to-right order.
company_order = ['MicrosoftHelps', 'AppleSupport', 'DellCares', 'HPSupport']
axis_labels = {'count': 'Number of Tweets', 'inbound': 'Tweet Type'}

fig = px.bar(
    grouped_data,
    x='author_id',
    y='count',
    color='inbound',
    labels=axis_labels,
    title='Distribution of Inbound and Outbound Tweets for Companies',
    category_orders={'author_id': company_order},
)

fig.show()


#### Calculating the average response time for each company 

In [None]:
# Select the technology support tweets as an independent copy: the following
# cells assign new values to columns of filtered_data, and doing that on a
# slice of data_set raises SettingWithCopyWarning.
filtered_data = data_set[data_set['author_id'].str.contains('MicrosoftHelps|AppleSupport|DellCares|HPSupport', case=False)].copy()

#### Convert 'created_at' and 'response_tweet_id' columns to datetime

In [None]:
# Parse the posting time of each tweet and drop any timezone information.
filtered_data['created_at'] = pd.to_datetime(filtered_data['created_at'], errors='coerce').dt.tz_localize(None)

# A tweet id is an identifier, not a timestamp, so passing it to
# pd.to_datetime produces a meaningless date. Look up the posting time of the
# responding tweet in data_set instead.
tweet_posted_at = pd.to_datetime(
    data_set.drop_duplicates(subset='tweet_id').set_index('tweet_id')['created_at'],
    errors='coerce',
).dt.tz_localize(None)

# presumably 'response_tweet_id' can list several ids separated by commas;
# only the first one is used here — TODO confirm against the data set.
first_response_id = pd.to_numeric(
    filtered_data['response_tweet_id'].astype(str).str.split(',').str[0],
    errors='coerce',
)

# The column keeps its name so the following cells work unchanged, but from
# here on it holds the posting time of the response (NaT when there is no
# response or the response tweet is not present in data_set).
# NOTE(review): for rows authored by a support account this is the time at
# which someone replied to that tweet; if the goal is how quickly the support
# account itself replied, 'in_response_to_tweet_id' is the column to look up
# — confirm the intended direction.
filtered_data['response_tweet_id'] = first_response_id.map(tweet_posted_at)

#### Filter out rows where 'response_tweet_id' or 'created_at' couldn't be parsed

In [None]:
# Drop rows where either timestamp is missing. Take an explicit copy because
# the next cell adds a 'response_time' column to this frame, which would
# otherwise trigger SettingWithCopyWarning.
cleaned_filtered_data = filtered_data.dropna(subset=['created_at', 'response_tweet_id']).copy()

#### Calculate response time in seconds 

In [None]:
# Difference between the 'response_tweet_id' and 'created_at' columns,
# stored as a number of seconds in a new 'response_time' column.
elapsed = cleaned_filtered_data['response_tweet_id'] - cleaned_filtered_data['created_at']
cleaned_filtered_data['response_time'] = elapsed.dt.total_seconds()

#### Results for average response times

In [None]:
# Mean of 'response_time' for every author_id, printed as a Series.
response_time_by_author = cleaned_filtered_data.groupby('author_id')['response_time']
average_response_time = response_time_by_author.mean()
print(average_response_time)

#### Visualized the Average Response Time for each Technology Company

In [None]:
# Pie chart of the per-account averages: one slice per author_id, labelled
# with the value and outlined in black.
colors = ['gold', 'mediumturquoise', 'hotpink', 'lightgreen']

labels = average_response_time.index
values = average_response_time.values

slice_outline = dict(color='#000000', width=2)
pie_trace = go.Pie(
    labels=labels,
    values=values,
    hoverinfo='label+percent',
    textinfo='value',
    textfont_size=20,
    marker=dict(colors=colors, line=slice_outline),
)

fig = go.Figure(data=[pie_trace])
fig.update_layout(title='Average Response Time Distribution', title_font_size=24)
fig.show()


In [None]:
#### Take the number of tweets and divide by the response time

### Airline Data Analysis

##### Step 1. Define the question:  
How does response time correlate to customer satisfaction when comparing American Airlines & Delta?

##### Step 2. Define the ideal data set: 
American Airlines and Delta

##### Step 3. Clean the data: 

In [None]:
# Handle missing values: replace missing ids with the placeholder -1.
# The filled column is assigned back explicitly; calling fillna(inplace=True)
# on a selected column is chained assignment, which pandas 2.x warns about
# and which has no effect on data_set once copy-on-write is enabled.
data_set['response_tweet_id'] = data_set['response_tweet_id'].fillna(-1)
data_set['in_response_to_tweet_id'] = data_set['in_response_to_tweet_id'].fillna(-1)

**TODO: Insert visualization for data product**

In [None]:
# Step 3. Obtain specific data
# Keep the tweets whose author handle contains "delta" or "american",
# ignoring case.
airline_pattern = 'delta|american'
is_airline_account = data_set['author_id'].str.contains(airline_pattern, case=False)
filtered_data = data_set[is_airline_account]

In [None]:
# Re-select the airline tweets so this cell can be run on its own.
filtered_data = data_set[data_set['author_id'].str.contains('delta|american', case=False)]

# Drive both the header and the cells from the same column list so a header
# label can never end up above the wrong column of values.
table_columns = list(filtered_data.columns)

fig = go.Figure(data=[go.Table(
    header=dict(values=table_columns,
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[filtered_data[column] for column in table_columns],
               fill_color='lavender',
               align='left'))
])

fig.show()

In [None]:
# Remove duplicates in the filtered data: keep the first occurrence of each
# fully identical row and continue with an independent copy.
is_repeated_row = filtered_data.duplicated()
filtered_data = filtered_data[~is_repeated_row].copy()

**TODO: Insert visualization for data product**

In [None]:
# Convert 'response_tweet_id' and 'in_response_to_tweet_id' to numeric;
# values that cannot be parsed as a number become NaN.
for id_column in ('response_tweet_id', 'in_response_to_tweet_id'):
    filtered_data[id_column] = pd.to_numeric(filtered_data[id_column], errors='coerce')

**TODO: Insert visualization for data product**

In [None]:
# Calculate response time
# Subtracting one tweet id from another gives a difference of identifiers,
# not a duration. Use posting times instead: for every tweet, look up when
# the tweet it answers ('in_response_to_tweet_id') was posted and measure
# the time that passed until this tweet was posted.
tweet_posted_at = pd.to_datetime(
    data_set.drop_duplicates(subset='tweet_id').set_index('tweet_id')['created_at'],
    errors='coerce',
    utc=True,
)
answered_tweet_posted_at = filtered_data['in_response_to_tweet_id'].map(tweet_posted_at)
reply_posted_at = pd.to_datetime(filtered_data['created_at'], errors='coerce', utc=True)

# Response time in seconds. Rows whose answered tweet cannot be found in
# data_set (including the -1 placeholder for "no id") become NaN and are
# therefore ignored by the averages computed later.
# NOTE(review): averages quoted elsewhere in this notebook in generic
# "units" were derived from id differences and should be regenerated.
filtered_data['response_time'] = (reply_posted_at - answered_tweet_posted_at).dt.total_seconds()

**TODO: Insert visualization for data product**

In [None]:
# Calculate average response time for each airline: mean of 'response_time'
# within every author_id group.
tweets_by_airline = filtered_data.groupby('author_id')
average_response_time = tweets_by_airline['response_time'].mean()

**TODO: Insert visualization for data product**

In [None]:
# Visualize the data: one bar per author_id showing its average response
# time, drawn on an 8x6 inch figure with horizontal tick labels.
colors = ['red', 'blue']
bar_fig, bar_ax = plt.subplots(figsize=(8, 6))
average_response_time.plot.bar(ax=bar_ax, color=colors, edgecolor='black')
bar_ax.set_title('Average Response Time for American Airlines and Delta')
bar_ax.set_xlabel('Airline')
bar_ax.set_ylabel('Average Response Time')
bar_ax.set_xticks(range(len(average_response_time.index)))
bar_ax.set_xticklabels(average_response_time.index, rotation=0)
plt.show()

#### TODO: fix visualization and replace with a better one

##### Step 4. Statistical Predictions / Modeling

In [None]:
# Pairwise correlation between the numeric columns of the airline tweets.
correlation = filtered_data.corr(numeric_only=True)

# Reuse the 'response_time' column produced by the "Calculate response time"
# cell instead of recomputing it here; keeping a second copy of that formula
# means the two can silently drift apart.
average_response_time = filtered_data.groupby('author_id')['response_time'].mean()

print(average_response_time)

***TODO: Insert better visualization for data product***

#### Convert to positive for better analysis of results

In [None]:
# Report the magnitude of each author's mean response time. The existing
# 'response_time' column from the "Calculate response time" cell is reused
# rather than recomputed, so there is a single definition of the metric.
average_response_time = filtered_data.groupby('author_id')['response_time'].mean().abs()

print(average_response_time)

***TODO: Insert better visualization for data product***

## Result
American Airlines, with an average response time of approximately 983,238.5 units, is faster than Delta, with an average response time of approximately 1,069,096 units. This may indicate that customers are more satisfied with American Airlines' customer support, based on the tweet data set.

**TODO: Insert visualization for final data product**