In [1]:
import numpy as np
import pandas as pd
import os
import pathlib

os.chdir(pathlib.Path(__file__).parent.joinpath('../'))

In [2]:
# Read file and create list of lines from file

def read_file(filename):
    lines = []
    with open(filename) as f:
        for line in f:
            lines.append(line)        

    print('Lines processed: {}'.format(len(lines)))
    return lines

In [3]:
# Transform data from lines into structured format

def parse_lines(lines):
    # Create list for transformed data
    cells_data = []

    # Iterate through lines
    for line in lines:    
        line_split_space = line.split(' ')   

        # Create dictionary entry
        entry = {'epoch_time': line_split_space[0], 'num_entries': line_split_space[3]}

        # Iterate through cell types
        for cell_type, col_len in list(zip(cell_types, col_lengths)):        

            # Split line by cell type
            line_split = line.split(cell_type)       

            # Create empty list for entries from particular cell type
            items = []

            # Iterate through parts of the line (excluding the first one)
            for item in line_split[1:]:

                # Split each part by space
                item_split = item.strip().split(' ')           

                items.append(item_split[:col_len])  

            # Add data received from particular cell type to the dict
            entry[cell_type] = items     

        cells_data.append(entry)

    print('Lines processed: {}'.format(len(cells_data)))
    return cells_data

In [4]:
# Transform data from list to dataframe

def prepare_dataframe(data):
    # Create raw dataframe from list
    cells_df = pd.DataFrame(data)

    # Create list of dataframes
    dfs = []

    # Iterate through cell types and corresponding column sets
    for cell_type, cols in list(zip(cell_types, cell_columns)):

        # Create dataframe for each cell type
        df = cells_df[['epoch_time', 'num_entries', cell_type]]

        # Create cell type identifier
        df['cell_type'] = cell_type
        #df.loc[:,'cell_type'] = cell_type

        # Explode list of lists and drop na
        df = df.explode(cell_type).dropna()

        # Create columns from lists
        df[cols] = pd.DataFrame(df[cell_type].tolist(), index=df.index)

        # Drop column which is no longer needed
        df.drop(cell_type, axis=1, inplace=True)

        # Append df to the list
        dfs.append(df)

    # Concatenate dataframes from different cell types
    df = pd.concat(dfs, axis=0, ignore_index=True)
    
    # Convert to numeric
    numeric_columns = list(df.columns)
    numeric_columns.remove('cell_type')
    df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
    
    print('Dataframe shape: {}'.format(df.shape))    
    
    return df
    

In [5]:
# Full pipeline

def prepare_data(filename):
    return prepare_dataframe(parse_lines(read_file(filename)))
    

In [6]:
# Data properties
cell_types = ['LTE', 'WCDMA', 'GSM']
lte_columns = ['isRegistered', 'ci', 'MCC', 'MNC', 'PCI', 'TAC', 'asuLevel', 'dBm', 'level']
wcdma_columns = ['isRegistered', 'cid', 'lac', 'MCC', 'MNC', 'PSC', 'asuLevel', 'dBm', 'level']
gsm_columns = ['isRegistered', 'cid', 'lac', 'MCC', 'MNC', 'asuLevel', 'dBm', 'level']
cell_columns = [lte_columns, wcdma_columns, gsm_columns]
col_lengths = [len(lte_columns), len(wcdma_columns), len(gsm_columns)]

In [None]:
# Run processing

filename = './data/train/Cells.txt'
df = prepare_data(filename)
print(df.head())
print(df.dtypes)
df.to_parquet('./data/train/Cells.parquet')

filename = './data/validate/Cells.txt'
prepare_data(filename).to_parquet('./data/validate/Cells.parquet')

filename = './data/test/Cells.txt'
prepare_data(filename).to_parquet('./data/test/Cells.parquet')


Lines processed: 1324881
Lines processed: 1324881


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type


Dataframe shape: (4474380, 15)
      epoch_time  num_entries cell_type  isRegistered            ci  MCC  MNC  \
0  1490430923343            3       LTE             1  1.280004e+08  234   10   
1  1490430923343            3       LTE             0  2.147484e+09  234   10   
2  1490430923343            3       LTE             0  2.147484e+09  234   10   
3  1490430923349            3       LTE             1  1.280004e+08  234   10   
4  1490430923349            3       LTE             0  2.147484e+09  234   10   

     PCI           TAC  asuLevel  dBm  level  cid  lac  PSC  
0   20.0  1.440000e+02        25 -115      2  NaN  NaN  NaN  
1  398.0  2.147484e+09        22 -118      1  NaN  NaN  NaN  
2    4.0  2.147484e+09        16 -124      1  NaN  NaN  NaN  
3   20.0  1.440000e+02        25 -115      2  NaN  NaN  NaN  
4  398.0  2.147484e+09        22 -118      1  NaN  NaN  NaN  
epoch_time        int64
num_entries       int64
cell_type        object
isRegistered      int64
ci            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type


Dataframe shape: (333901, 15)
Lines processed: 561369
Lines processed: 561369


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type
