# Synthetic CDR files for testing

This file creates synthetic "raw" CDRs and saves them as daily .csv-files. The format of the CDRs follows the structure laid out in the "Covariate Creation Process" document.

The following features are included in the CDRs:
- Off-net / international events (via missing records)
- Different activity levels of users (via sample weights)

The following features could be included in the future:
- Spatio-temporal dependencies and paths for users
- (Social) network structures among users

## Modules

In [None]:
import os  # operating system functions like renaming files and directories
import shutil  # recursive file and directory operations
import datetime as dt  # handling timestamps
import random as rd  # generating random numbers for distributions
import pandas as pd  # data mangling and transforming
import numpy as np  # handling vectors and matrices
from random import choices  # to draw from weighted distribution
from attributes_classes import MockupAttributes  # module intern attributes

## Parameters from MockupAttributes class & random seed

In [None]:
# Central configuration object for the synthetic data set. The comments note
# how each setting is used further down in this notebook.
moa = MockupAttributes(
    n_cells=150,  # number of distinct cell IDs sampled for the tower table
    n_users=500,  # size of the MSISDN population
    call_unit='m',  # anything but 's' yields durations with two decimals
    max_call_duration=120,  # exclusive upper bound of the drawn durations
    date_format='%d %b %Y %H:%M:%S',  # strftime pattern for the time stamps
    date_window=['2018-06-01', '2018-06-30'],  # first and last day of events
    raw_header=False,  # passed as `header` when writing the daily CDR files
    location_header=False,  # passed as `header` when writing the tower file
    output_path='../data/',  # folder that is deleted and recreated on export
)

In [None]:
# Seed BOTH random number generators used in this notebook. The standard
# library generator drives rd.sample() and choices(); NumPy's global generator
# drives every np.random.* call (coordinates, IDs, weights, events). Seeding
# only one of them leaves most of the generated data non-reproducible.
seed = 6041
rd.seed(seed)
np.random.seed(seed)

## Create cells & antennas with specific coordinates

In [None]:
# Draw n_cells distinct cell IDs (sampling without replacement) from the
# integer range [n_cells, 10 * n_cells) and start the tower table with them.
cell_ids = rd.sample(range(moa.n_cells, moa.n_cells * 10), moa.n_cells)
tow = pd.DataFrame({moa.loc_column_names['cell']: cell_ids})

In [None]:
# Derive each cell's antenna ID: the cell ID divided by the number of cells
# per antenna, rounded down and stored as an integer.
cell_per_antenna_ratio = tow[moa.loc_column_names['cell']] / moa.n_cells_p_antenna
tow[moa.loc_column_names['antenna']] = np.floor(cell_per_antenna_ratio).astype(int)

In [None]:
# distinct antenna IDs occurring in the tower table
u_antennas = tow[moa.loc_column_names['antenna']].unique()

In [None]:
# Give every antenna one uniformly drawn position inside the configured
# longitude/latitude ranges, then attach that position to each of its cells.
n_antennas = len(u_antennas)
antenna_positions = pd.DataFrame(data={
    moa.loc_column_names['antenna']: u_antennas,
    # longitude is drawn before latitude
    moa.loc_column_names['long']: np.random.uniform(moa.long_range[0],
                                                    moa.long_range[1],
                                                    size=n_antennas),
    moa.loc_column_names['lat']: np.random.uniform(moa.lat_range[0],
                                                   moa.lat_range[1],
                                                   size=n_antennas),
})
tow = tow.merge(antenna_positions,
                how='left',
                on=moa.loc_column_names['antenna'])

## Create indicator for Out-Of-Area (OOA) traffic

In [None]:
# Collect the cell IDs of all towers lying in a 0.5 wide border strip along
# the edges of the coordinate ranges; they are used later to drop records.
tower_long = tow[moa.loc_column_names['long']]
tower_lat = tow[moa.loc_column_names['lat']]
in_border = ((tower_long < moa.long_range[0] + 0.5)
             | (tower_long > moa.long_range[1] - 0.5)
             | (tower_lat < moa.lat_range[0] + 0.5)
             | (tower_lat > moa.lat_range[1] - 0.5))
tow_out = tow[in_border][moa.loc_column_names['cell']]

## Create population of MSISDNs with different activity levels

In [None]:
# One ID per user, drawn uniformly between 1e8 and 1e9.
# NOTE(review): the values are rounded to two decimals, so the IDs stay
# floats rather than integers - confirm that this is the intended format.
raw_ids = np.random.uniform(100000000, 1000000000, size=moa.n_users)
id_pop = raw_ids.round(2)

In [None]:
# activity level per user: normally distributed around 1 (std. dev. 0.2)
weights = np.random.normal(loc=1, scale=0.2, size=moa.n_users)

In [None]:
# negative activity levels make no sense as sampling weights: set them to 0
is_negative = weights < 0
weights = weights.copy()
weights[is_negative] = 0

In [None]:
# standardize: rescale so that the weights add up to 1 and can be used as
# sampling probabilities
weight_total = sum(weights)
weights = weights / weight_total

## Create basic outgoing frame

In [None]:
# Durations are drawn as integers in [0, max_call_duration * multiplier) and
# divided by the multiplier afterwards:
#   - seconds ('s'): multiplier 1, durations stay whole numbers
#   - otherwise:     multiplier 100, durations get two digits after the comma
if moa.call_unit == 's':
    call_unit_multiplier = 1
else:
    call_unit_multiplier = 100

In [None]:
# Outgoing events (type 2): one row per event, n_total_events rows in total.
# Callers and partners are drawn from the ID population according to the
# activity weights; time stamps, locations, services, TACs and durations are
# drawn uniformly.

# Every second of the date window is a candidate time stamp. The lower-case
# 's' alias is used because newer pandas releases deprecate the upper-case 'S'.
candidate_seconds = pd.date_range(moa.date_window[0] + ' 00:00:00',
                                  moa.date_window[1] + ' 23:59:59',
                                  freq='s')

# NOTE: the dict entries are evaluated top to bottom, so their order fixes the
# order of the random draws - keep it stable for reproducible output.
cdr_out = pd.DataFrame(data={
    moa.raw_column_names['type']: 2,
    moa.raw_column_names['msisdn']: np.random.choice(
        id_pop, p=weights, size=moa.n_total_events, replace=True),
    moa.raw_column_names['date']: np.random.choice(
        candidate_seconds, size=moa.n_total_events, replace=True),
    # service is 1 or 2 (the upper bound of randint is exclusive)
    moa.raw_column_names['service']: np.random.randint(
        1, 3, size=moa.n_total_events),
    moa.raw_column_names['location']: np.random.choice(
        tow[moa.loc_column_names['cell']], size=moa.n_total_events, replace=True),
    # partner type 1 with probability 0.99, type 2 with probability 0.01
    moa.raw_column_names['partner_type']: choices(
        [1, 2], [0.99, 0.01], k=moa.n_total_events),
    moa.raw_column_names['partner']: np.random.choice(
        id_pop, p=weights, size=moa.n_total_events, replace=True),
    moa.raw_column_names['tac']: np.random.uniform(
        1000000, 10000000, size=moa.n_total_events),
    # integer draw divided by the multiplier, see call_unit_multiplier
    moa.raw_column_names['duration']: np.random.randint(
        0, moa.max_call_duration * call_unit_multiplier,
        size=moa.n_total_events) / call_unit_multiplier,
})

In [None]:
# day key (YYYYMMDD) of every event; used later to split the records into
# one file per day
event_times = cdr_out[moa.raw_column_names['date']]
cdr_out['chunk'] = event_times.dt.strftime('%Y%m%d')

In [None]:
# Render the time stamps as upper-case strings in the configured format;
# without a configured format they are left untouched.
if moa.date_format is not None:
    formatted_dates = cdr_out[moa.raw_column_names['date']].dt.strftime(moa.date_format)
    cdr_out[moa.raw_column_names['date']] = formatted_dates.str.upper()

## Create basic incoming frame

In [None]:
# Incoming events (type 1): the mirror image of every outgoing event. Caller
# and partner are swapped; date, service, duration and chunk are copied;
# location and TAC are drawn anew.
# NOTE(review): partner_type is filled from the outgoing frame's *type* column
# (constant 2), not from its partner_type column - confirm this is intended.
raw_cols = moa.raw_column_names
cdr_in = pd.DataFrame(data={
    raw_cols['type']: 1,
    raw_cols['msisdn']: cdr_out[raw_cols['partner']],
    raw_cols['date']: cdr_out[raw_cols['date']],
    raw_cols['service']: cdr_out[raw_cols['service']],
    raw_cols['location']: np.random.choice(tow[moa.loc_column_names['cell']],
                                           size=moa.n_total_events,
                                           replace=True),
    raw_cols['partner_type']: cdr_out[raw_cols['type']],
    raw_cols['partner']: cdr_out[raw_cols['msisdn']],
    raw_cols['tac']: np.random.uniform(1000000, 10000000,
                                       size=moa.n_total_events),
    raw_cols['duration']: cdr_out[raw_cols['duration']],
    'chunk': cdr_out['chunk'],
})

## Join to one frame

In [None]:
# Stack incoming and outgoing records into one frame (incoming rows first,
# original row labels kept). pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
cdr = pd.concat([cdr_in, cdr_out])
# release memory: the two partial frames are no longer needed
del cdr_in, cdr_out

## Postprocessing

In [None]:
# Remove OOA records: drop every event whose cell is in the out-of-area list
is_out_of_area = cdr[moa.raw_column_names['location']].isin(tow_out)
cdr = cdr[~is_out_of_area]

In [None]:
# Delete records of self-calling/-texting: keep only rows where the MSISDN
# differs from the partner
has_other_partner = cdr[moa.raw_column_names['msisdn']] != cdr[moa.raw_column_names['partner']]
cdr = cdr[has_other_partner]

In [None]:
# Set call duration for SMS to NA: rows with service code 2 get None as
# duration, all other rows keep their value
is_sms = cdr[moa.raw_column_names['service']] == 2
cdr[moa.raw_column_names['duration']] = np.where(
    is_sms, None, cdr[moa.raw_column_names['duration']])

## Exporting

### daily CDRs to .csv

In [None]:
# one YYYYMMDD key per day of the date window (both ends included)
window_days = pd.date_range(start=moa.date_window[0], end=moa.date_window[1])
days = window_days.strftime('%Y%m%d')

In [None]:
# Start from an empty output folder: an existing folder is removed together
# with everything inside it before the folder is created again.
output_exists = os.path.exists(moa.output_path)
if output_exists:
    shutil.rmtree(moa.output_path)
os.makedirs(moa.output_path)

In [None]:
# Write one csv file per day of the date window, named <YYYYMMDD>.csv. A day
# without any records still gets an (empty) file. The helper column 'chunk'
# is not exported. os.path.join makes the target path correct whether or not
# output_path ends with a path separator.
for i in days:
    temp = cdr[cdr['chunk'] == i].drop(columns='chunk')
    temp.to_csv(os.path.join(moa.output_path, '%s.csv' % i),
                index=False, header=moa.raw_header)

### tower locations to .csv

In [None]:
# Write the tower table (cell, antenna, coordinates) into the output folder.
# os.path.join makes the target path correct whether or not output_path ends
# with a path separator.
tow.to_csv(os.path.join(moa.output_path, moa.loc_file_name),
           index=False, header=moa.location_header)