In [1]:
import numpy as np
import pandas as pd
import os

os.chdir('../')

## Cells dataframe

In [49]:
# Read file and create list of lines from file

filename = './data/train/Cells.txt'

lines = []
with open(filename) as f:
    for line in f:
        lines.append(line)        

print('Lines processed: {}'.format(len(lines)))

In [142]:
# Data properties
cell_types = ['LTE', 'WCDMA', 'GSM']
lte_columns = ['isRegistered', 'ci', 'MCC', 'MNC', 'PCI', 'TAC', 'asuLevel', 'dBm', 'level']
wcdma_columns = ['isRegistered', 'cid', 'lac', 'MCC', 'MNC', 'PSC', 'asuLevel', 'dBm', 'level']
gsm_columns = ['isRegistered', 'cid', 'lac', 'MCC', 'MNC', 'asuLevel', 'dBm', 'level']
cell_columns = [lte_columns, wcdma_columns, gsm_columns]
col_lengths = [len(lte_columns), len(wcdma_columns), len(gsm_columns)]

In [100]:
# Transform data from lines into structured format

# Create list for transformed data
cells_data = []

# Iterate through lines
for line in lines:    
    line_split_space = line.split(' ')   
    
    # Create dictionary entry
    entry = {'epoch_time': line_split_space[0], 'num_entries': line_split_space[3]}
    
    # Iterate through cell types
    for cell_type, col_len in list(zip(cell_types, col_lengths)):        
    
        # Split line by cell type
        line_split = line.split(cell_type)       
        
        # Create empty list for entries from particular cell type
        items = []
        
        # Iterate through parts of the line (excluding the first one)
        for item in line_split[1:]:
            
            # Split each part by space
            item_split = item.strip().split(' ')           

            items.append(item_split[:col_len])  
        
        # Add data received from particular cell type to the dict
        entry[cell_type] = items     
    
    cells_data.append(entry)

print('Lines processed: {}'.format(len(cells_data)))

1324881


In [217]:
# Transform data from list to dataframe

# Create raw dataframe from list
cells_df = pd.DataFrame(cells_data)

# Create list of dataframes
dfs = []

# Iterate through cell types and corresponding column sets
for cell_type, cols in list(zip(cell_types, cell_columns)):
    
    # Create dataframe for each cell type
    df = cells_df[['epoch_time', 'num_entries', cell_type]]
    
    # Create cell type identifier
    df['cell_type'] = cell_type  
    
    # Explode list of lists and drop na
    df = df.explode(cell_type).dropna()
    
    # Create columns from lists
    df[cols] = pd.DataFrame(df[cell_type].tolist(), index=df.index)
    
    # Drop column which is no longer needed
    df.drop(cell_type, axis=1, inplace=True)
    
    # Append df to the list
    dfs.append(df)

# Concatenate dataframes from different cell types    
cells_df = pd.concat(dfs, axis=0, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type


In [220]:
cells_df.head()

Unnamed: 0,epoch_time,cell_type,isRegistered,ci,MCC,MNC,PCI,TAC,asuLevel,dBm,level,cid,lac,PSC
0,1490430923343,LTE,1,128000386,234,10,20,144,25,-115,2,,,
1,1490430923343,LTE,0,2147483647,234,10,398,2147483647,22,-118,1,,,
2,1490430923343,LTE,0,2147483647,234,10,4,2147483647,16,-124,1,,,
3,1490430923349,LTE,1,128000386,234,10,20,144,25,-115,2,,,
4,1490430923349,LTE,0,2147483647,234,10,398,2147483647,22,-118,1,,,
