In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Preparation

In [3]:
df = pd.read_csv('ALL_CONVERSATIONS_RAW.csv', names=['conversation_id', 'raw_tweets_info'])
df.head()

Unnamed: 0,conversation_id,raw_tweets_info
0,0,"[(747530913081401344, 19734484)]"
1,1,"[(747530906978627584, 108677957)]"
2,2,"[(747530874573361152, 2912532279)]"
3,3,"[(747530862854610944, 351287982)]"
4,4,"[(747530778960072704, 16522272)]"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1072705 entries, 0 to 1072704
Data columns (total 2 columns):
conversation_id    1072705 non-null int64
raw_tweets_info    1072705 non-null object
dtypes: int64(1), object(1)
memory usage: 16.4+ MB


## Data Wrangling

In [5]:
tweet_ids = []
for conv in list(df['raw_tweets_info']):
    ids = []
    for tweet in eval(conv):
        ids.append(tweet[0])
    tweet_ids.append(ids)
print('Sample Tweet ids of conversations')
print(f'{tweet_ids[:5]}\n')

user_ids = []
for conv in list(df['raw_tweets_info']):
    ids = []
    for tweet in eval(conv):
        ids.append(tweet[1]) 
    user_ids.append(ids)

print('Sample user ids in conversation:')
tweet_ids[:5]

Sample Tweet ids of conversations
[[747530913081401344], [747530906978627584], [747530874573361152], [747530862854610944], [747530778960072704]]

Sample user ids in conversation:


[[747530913081401344],
 [747530906978627584],
 [747530874573361152],
 [747530862854610944],
 [747530778960072704]]

In [6]:
df['tweet_ids'] = tweet_ids
df['user_ids'] = user_ids

In [7]:
df['conversation_length'] = [len(eval(tweet)) for tweet in list(df['raw_tweets_info'])]

In [8]:
df.head()

Unnamed: 0,conversation_id,raw_tweets_info,tweet_ids,user_ids,conversation_length
0,0,"[(747530913081401344, 19734484)]",[747530913081401344],[19734484],1
1,1,"[(747530906978627584, 108677957)]",[747530906978627584],[108677957],1
2,2,"[(747530874573361152, 2912532279)]",[747530874573361152],[2912532279],1
3,3,"[(747530862854610944, 351287982)]",[747530862854610944],[351287982],1
4,4,"[(747530778960072704, 16522272)]",[747530778960072704],[16522272],1


In [9]:
df['conversation_length'].value_counts()

1     644078
2     316059
3      72506
4      27504
5       7603
6       3195
7       1011
8        448
9        150
10        76
11        40
12        17
15         6
13         4
16         3
20         2
19         1
14         1
17         1
Name: conversation_length, dtype: int64

In [10]:
# Get only rows where there actually is a conversation
df = df[df['conversation_length'] > 1]

In [11]:
# Black magic
keys = ["KLM", "AirFrance", "British_Airways", "AmericanAir", "Lufthansa", "AirBerlin", "AirBerlin assist", "easyJet", "RyanAir", "SingaporeAir", "Qantas", "EtihadAirways", "VirginAtlantic"]
values = ["56377143", "106062176", "18332190", "22536055", "124476322", "26223583", "2182373406", "38676903", "1542862735", "253340062", "218730857", "45621423", "20626359"]
values = [int(value) for value in values]
airline_dict = dict(zip(keys, [int(value) for value in values]))

airlines = []
for row in list(df['user_ids']):
    if bool(set(list(row)).intersection(values)):
        airline_ids = set((list(row))).intersection(values)
        extracted_airlines = [keys for keys,values in airline_dict.items() if values == list(airline_ids)[0]]
        airlines.append(extracted_airlines[0])
    else:
        airlines.append('No airlines involved')

In [12]:
airlines[:5]

['No airlines involved',
 'No airlines involved',
 'No airlines involved',
 'No airlines involved',
 'No airlines involved']

In [13]:
df['airlines_involved'] = airlines

## Exploration of new dataframe

In [14]:
df.head()

Unnamed: 0,conversation_id,raw_tweets_info,tweet_ids,user_ids,conversation_length,airlines_involved
5,5,"[(747530773037780992, 31132554), (747526794446...","[747530773037780992, 747526794446802944]","[31132554, 22873368]",2,No airlines involved
6,6,"[(747530681509625860, 417945325), (74745123859...","[747530681509625860, 747451238598475776]","[417945325, 55904765]",2,No airlines involved
11,11,"[(747530531475173376, 597673831), (74752831114...","[747530531475173376, 747528311144734720]","[597673831, 427951650]",2,No airlines involved
12,12,"[(747530509098450944, 1624311463), (7475297117...","[747530509098450944, 747529711782244352]","[1624311463, 1624311463]",2,No airlines involved
14,14,"[(747530348561571840, 154605692), (74752081839...","[747530348561571840, 747520818398232576]","[154605692, 117754884]",2,No airlines involved


In [15]:
df['airlines_involved'].value_counts().plot(kind='barh', figsize=(15,5))
plt.title('Airline conversation involvement distribution', weight='bold', fontsize=20)
plt.xlabel('Frequency', fontsize=16)
plt.ylabel('Airline', fontsize=16, rotation=45);
plt.savefig('Airline conversation involvement', dpi=300)

## Save to csv

In [17]:
df.to_csv('conversations_10_jsons.csv', index=False)

## Final Check

In [16]:
df = pd.read_csv('conversations_10_jsons.csv')
display(df.info())
display(df.head())
display(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 6 columns):
conversation_id        457 non-null int64
raw_tweets_info        457 non-null object
tweet_ids              457 non-null object
user_ids               457 non-null object
conversation_length    457 non-null int64
airlines_involved      457 non-null object
dtypes: int64(2), object(4)
memory usage: 21.5+ KB


None

Unnamed: 0,conversation_id,raw_tweets_info,tweet_ids,user_ids,conversation_length,airlines_involved
0,2,"[(739565730891464705, 2158887092), (7395609423...","[739565730891464705, 739560942376259584]","[2158887092, 2183268807]",2,No airlines involved
1,58,"[(739566848157569027, 157574171), (73951945900...","[739566848157569027, 739519459006816256]","[157574171, 157574171]",2,No airlines involved
2,77,"[(739567295429627904, 219995045), (73955845160...","[739567295429627904, 739558451601248256]","[219995045, 219995045]",2,No airlines involved
3,83,"[(739567393849106432, 79447915), (739515629334...","[739567393849106432, 739515629334626304]","[79447915, 19973673]",2,No airlines involved
4,206,"[(739569215598960640, 141980503), (73919975208...","[739569215598960640, 739199752085082112]","[141980503, 141980503]",2,No airlines involved


Unnamed: 0,conversation_id,conversation_length
count,457.0,457.0
mean,40393.037199,2.019694
std,26844.75497,0.139098
min,2.0,2.0
25%,12826.0,2.0
50%,41958.0,2.0
75%,63804.0,2.0
max,109916.0,3.0
