# Synthetic CDR files for testing (sample_data)

This file creates synthetic "raw" call detail records (CDRs) and saves them as daily .csv files.

The format of these CDRs is defined by the following structure which can be adjusted by the user:


| call_record_type | caller_msisdn | call_date          | basic_service | cell_id            | call_partner_identity_type | call_partner_identity | tac_code | call_duration |
|------------------|---------------|--------------------|---------------|--------------------|----------------------------|-----------------------|----------|---------------|
| 2                | ---           | 2018-01-01 0:00:00 | 1             | 608-01-09004-02971 | 1                          | ---                   | ---      | 1             |
| 1                | ---           | 2018-01-01 0:00:00 | 2             | 608-01-00820-03058 | 1                          | ---                   | ---      | NA            |
| 1                | ---           | 2018-01-01 0:00:01 | 1             | 608-01-00111-01429 | 1                          | ---                   | ---      | 9             |

The call_record_type gives the direction of an event (1 = incoming, 2 = outgoing), the call_date is its exact timestamp, and the basic_service gives its nature (1 = call, 2 = SMS). The cell_id corresponds to a Base Transceiver Station (BTS) located at an antenna, and the call_partner_identity_type indicates whether the event is national (1) or international (2). The call_duration is the length of a call in seconds (coded as NA for SMS). The tac_code refers to a user's mobile device, while the caller_msisdn and the call_partner_identity identify the initiator and the receiver of an event. These three variables are the most critical ones for privacy, but they are not visible in the final features dataset.

The following features are included in the CDRs:
- Off-net / international events (via missing records)
- Different activity levels of users (via sample weights)

The following features could be included in the future:
- Spatio-temporal dependencies and paths for users
- (Social) network structures among users

## Modules

In [1]:
import os  # operating system functions like renaming files and directories
import shutil  # recursive file and directory operations
import datetime as dt  # handling timestamps
import random as rd  # generating random numbers for distributions
import pandas as pd  # data mangling and transforming
import numpy as np  # handling vectors and matrices
from gnuper import attributes # making use of the mockup attributes
from random import choices  # to draw from weighted distribution

ImportError: cannot import name 'choices'

## Parameters from MockupAttributes class & random seed

In [None]:
# default values for a small sample set
mua = attributes.MockupAttributes(
    n_antennas=500,
    n_users=5000,
    max_call_duration=120*60,  # upper bound handed to the duration draw
    date_format='%Y-%m-%d %H:%M:%S',
    date_window=['2018-10-01', '2018-10-31'],  # first and last day to simulate
    raw_header=False,  # write the daily CDR files without a header row
    location_header=False,  # write the tower file without a header row
    output_path='../sample_data/')

In [None]:
# Seed BOTH random number generators: `random` is used for the tower ids,
# while all remaining draws (coordinates, users, events) come from numpy.
# Seeding only one of them leaves the generated data non-reproducible.
rd.seed(6041)
np.random.seed(6041)

## Create cells & antennas with specific coordinates

In [None]:
# total number of cells: every antenna carries the same number of cells
n_cells = mua.n_cells_p_antenna * mua.n_antennas

In [None]:
# unique random ids for the antennas and (drawn afterwards) for the cells
antenna_ids = rd.sample(range(mua.n_antennas, mua.n_antennas*10), mua.n_antennas)
cell_ids = rd.sample(range(n_cells, n_cells*10), n_cells)

# the antenna id list is repeated once per cell so both columns have n_cells rows
tow = pd.DataFrame(data={mua.loc_column_names['antenna']: antenna_ids*mua.n_cells_p_antenna,
                         mua.loc_column_names['cell']: cell_ids})

In [None]:
# draw one uniformly distributed (long, lat) position per antenna ...
antenna_col = mua.loc_column_names['antenna']
antenna_long = np.random.uniform(mua.long_range[0], mua.long_range[1], size=mua.n_antennas)
antenna_lat = np.random.uniform(mua.lat_range[0], mua.lat_range[1], size=mua.n_antennas)
antenna_pos = pd.DataFrame(data={antenna_col: tow[antenna_col].unique(),
                                 mua.loc_column_names['long']: antenna_long,
                                 mua.loc_column_names['lat']: antenna_lat})

# ... and attach it to every cell of that antenna
tow = tow.merge(antenna_pos, how='left', on=antenna_col)

## Create indicator for Out-Of-Area (OOA) traffic

In [None]:
# cells whose tower lies within 0.5 (coordinate units) of any edge of the
# long/lat range are remembered as out-of-area cells
long_col = mua.loc_column_names['long']
lat_col = mua.loc_column_names['lat']
border_conditions = [long_col+'<'+str(mua.long_range[0]+0.5),
                     long_col+'>'+str(mua.long_range[1]-0.5),
                     lat_col+'<'+str(mua.lat_range[0]+0.5),
                     lat_col+'>'+str(mua.lat_range[1]-0.5)]
# a tower matching at least one of the conditions is out of area
cell_out = tow.query('|'.join(border_conditions))[mua.loc_column_names['cell']].unique()

## Create population of MSISDNs with different activity levels

In [None]:
# one random numeric id per user (rounded to two decimals), used as MSISDN
id_pop = np.random.uniform(1e8, 1e9, size=mua.n_users).round(decimals=2)

In [None]:
# raw activity level of each user: normally distributed around 1 (sd 0.2)
weights = np.random.normal(loc=1, scale=0.2, size=mua.n_users)

In [None]:
# winsorize from below: negative activity levels are raised to 0
weights = np.clip(weights, 0, None)

In [None]:
# standardize so the weights sum to 1 and can be used as sampling probabilities;
# ndarray.sum() adds up inside numpy instead of looping over the elements in Python
weights = weights / weights.sum()

## Create basic outgoing frame

In [None]:
# One row per outgoing event. All columns are drawn independently of each other.
# Every draw uses numpy's generator: the national/international flag used to
# come from `random.choices`, which only exists on Python >= 3.6 and does not
# follow numpy's random state.
cdr_out = pd.DataFrame(data={
    # 2 = outgoing
    mua.raw_column_names['type']: 2,
    # initiating user, more active users are drawn more often
    mua.raw_column_names['msisdn']: np.random.choice(
        id_pop, p=weights, size=mua.n_total_events, replace=True),
    # any second between the first and the last day of the date window
    mua.raw_column_names['date']: np.random.choice(
        pd.date_range(mua.date_window[0]+' 00:00:00',
                      mua.date_window[1]+' 23:59:59', freq='S'),
        size=mua.n_total_events, replace=True),
    # 1 = call, 2 = SMS (upper bound of randint is exclusive)
    mua.raw_column_names['service']: np.random.randint(1, 3, size=mua.n_total_events),
    # cell handling the event, every cell is equally likely
    mua.raw_column_names['location']: np.random.choice(
        tow[mua.loc_column_names['cell']],
        size=mua.n_total_events, replace=True),
    # 1 = national (99%), 2 = international (1%)
    mua.raw_column_names['partner_type']: np.random.choice(
        [1, 2], p=[0.99, 0.01], size=mua.n_total_events, replace=True),
    # receiving user, drawn with the same activity weights
    mua.raw_column_names['partner']: np.random.choice(
        id_pop, p=weights, size=mua.n_total_events, replace=True),
    # random number standing in for the device code
    mua.raw_column_names['tac']: np.random.uniform(
        1e6, 1e7, size=mua.n_total_events),
    # binomial draw with an expected value of 60, capped at max_call_duration;
    # 60.0 keeps the probability a float even under integer division
    mua.raw_column_names['duration']: np.random.binomial(
        mua.max_call_duration, 60.0/mua.max_call_duration, size=mua.n_total_events)})

In [None]:
# day label (YYYYMMDD) of every event, used to split the export into daily files
event_dates = cdr_out[mua.raw_column_names['date']]
cdr_out['chunk'] = event_dates.dt.strftime('%Y%m%d')

## Create basic incoming frame

In [None]:
# Mirror every outgoing event as an incoming one: caller and partner swap
# roles, while date, service, duration and day chunk are shared by both sides.
cdr_in = pd.DataFrame(data={
    # 1 = incoming
    mua.raw_column_names['type']: 1,
    # the receiver of the outgoing event owns the incoming record
    mua.raw_column_names['msisdn']: cdr_out[mua.raw_column_names['partner']],
    mua.raw_column_names['date']: cdr_out[mua.raw_column_names['date']],
    mua.raw_column_names['service']: cdr_out[mua.raw_column_names['service']],
    # the receiver is served by its own, independently drawn cell
    mua.raw_column_names['location']: np.random.choice(
        tow[mua.loc_column_names['cell']], size=mua.n_total_events, replace=True),
    # national/international flag of the mirrored event; this used to copy
    # the outgoing 'type' column (always 2), which flagged every incoming
    # record as international
    mua.raw_column_names['partner_type']: cdr_out[mua.raw_column_names['partner_type']],
    mua.raw_column_names['partner']: cdr_out[mua.raw_column_names['msisdn']],
    # the receiver uses a different device than the caller
    mua.raw_column_names['tac']: np.random.uniform(1e6, 1e7, size=mua.n_total_events),
    mua.raw_column_names['duration']: cdr_out[mua.raw_column_names['duration']],
    'chunk': cdr_out['chunk']})

## Join to one frame

In [None]:
# stack incoming and outgoing records (row labels are kept as they are);
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
cdr = pd.concat([cdr_in, cdr_out])
# release memory
del cdr_in, cdr_out

## Postprocessing

In [None]:
# Remove OOA records, i.e. events handled by one of the out-of-area cells
is_ooa = cdr[mua.raw_column_names['location']].isin(cell_out)
cdr = cdr[~is_ooa]

In [None]:
# Delete records of self-calling/-texting (caller and partner are the same id)
same_party = cdr[mua.raw_column_names['msisdn']] == cdr[mua.raw_column_names['partner']]
cdr = cdr[~same_party]

In [None]:
# Set call duration for SMS (service code 2) to NA, calls keep their duration
is_sms = cdr[mua.raw_column_names['service']] == 2
cdr[mua.raw_column_names['duration']] = np.where(is_sms, None,
                                                 cdr[mua.raw_column_names['duration']])

## Exporting

### daily CDRs to .csv

In [None]:
# one YYYYMMDD label per calendar day of the date window
days = pd.date_range(start=mua.date_window[0], end=mua.date_window[1]).strftime('%Y%m%d')

In [None]:
# start from an empty output folder: an existing one is removed with all its content
out_dir = mua.output_path
if os.path.exists(out_dir):
    shutil.rmtree(out_dir)
os.makedirs(out_dir)

In [None]:
# write one csv per day; the helper column 'chunk' is not exported and a day
# without any records still gets its (empty) file
for day in days:
    daily = cdr[cdr['chunk'] == day].drop('chunk', axis=1)
    daily.to_csv(mua.output_path + day + '.csv', index=False, header=mua.raw_header)

### tower locations to .csv

In [None]:
# the tower table goes into the same folder as the daily CDR files
tow_file = mua.output_path + mua.loc_file_name
tow.to_csv(tow_file, index=False, header=mua.location_header)