In [1]:
import numpy as np
import pandas as pd
import os

os.chdir('../')

## Cells dataframe

In [2]:
# Read file and create list of lines from file

filename = './data/train/Cells.txt'

lines = []
with open(filename) as f:
    for line in f:
        lines.append(line)        

print('Lines processed: {}'.format(len(lines)))

Lines processed: 1324881


In [3]:
# Data properties
cell_types = ['LTE', 'WCDMA', 'GSM']
lte_columns = ['isRegistered', 'ci', 'MCC', 'MNC', 'PCI', 'TAC', 'asuLevel', 'dBm', 'level']
wcdma_columns = ['isRegistered', 'cid', 'lac', 'MCC', 'MNC', 'PSC', 'asuLevel', 'dBm', 'level']
gsm_columns = ['isRegistered', 'cid', 'lac', 'MCC', 'MNC', 'asuLevel', 'dBm', 'level']
cell_columns = [lte_columns, wcdma_columns, gsm_columns]
col_lengths = [len(lte_columns), len(wcdma_columns), len(gsm_columns)]

In [4]:
# Transform data from lines into structured format

# Create list for transformed data
cells_data = []

# Iterate through lines
for line in lines:    
    line_split_space = line.split(' ')   
    
    # Create dictionary entry
    entry = {'epoch_time': line_split_space[0], 'num_entries': line_split_space[3]}
    
    # Iterate through cell types
    for cell_type, col_len in list(zip(cell_types, col_lengths)):        
    
        # Split line by cell type
        line_split = line.split(cell_type)       
        
        # Create empty list for entries from particular cell type
        items = []
        
        # Iterate through parts of the line (excluding the first one)
        for item in line_split[1:]:
            
            # Split each part by space
            item_split = item.strip().split(' ')           

            items.append(item_split[:col_len])  
        
        # Add data received from particular cell type to the dict
        entry[cell_type] = items     
    
    cells_data.append(entry)

print('Lines processed: {}'.format(len(cells_data)))

Lines processed: 1324881


In [5]:
# Transform data from list to dataframe

# Create raw dataframe from list
cells_df = pd.DataFrame(cells_data)

# Create list of dataframes
dfs = []

# Iterate through cell types and corresponding column sets
for cell_type, cols in list(zip(cell_types, cell_columns)):
    
    # Create dataframe for each cell type
    df = cells_df[['epoch_time', 'num_entries', cell_type]]
    
    # Create cell type identifier
    df['cell_type'] = cell_type  
    
    # Explode list of lists and drop na
    df = df.explode(cell_type).dropna()
    
    # Create columns from lists
    df[cols] = pd.DataFrame(df[cell_type].tolist(), index=df.index)
    
    # Drop column which is no longer needed
    df.drop(cell_type, axis=1, inplace=True)
    
    # Append df to the list
    dfs.append(df)

# Concatenate dataframes from different cell types    
cells_df = pd.concat(dfs, axis=0, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cell_type'] = cell_type


In [6]:
cells_df.head()

Unnamed: 0,epoch_time,num_entries,cell_type,isRegistered,ci,MCC,MNC,PCI,TAC,asuLevel,dBm,level,cid,lac,PSC
0,1490430923343,3,LTE,1,128000386,234,10,20,144,25,-115,2,,,
1,1490430923343,3,LTE,0,2147483647,234,10,398,2147483647,22,-118,1,,,
2,1490430923343,3,LTE,0,2147483647,234,10,4,2147483647,16,-124,1,,,
3,1490430923349,3,LTE,1,128000386,234,10,20,144,25,-115,2,,,
4,1490430923349,3,LTE,0,2147483647,234,10,398,2147483647,22,-118,1,,,


In [7]:
cells_df.shape

(4474380, 15)

In [8]:
# Save prepared data to csv
cells_df.to_csv('./data/train/Cells.csv', index=False)

In [9]:
cells_df.cell_type.value_counts()

LTE      3035275
GSM       933211
WCDMA     505894
Name: cell_type, dtype: int64

In [10]:
cells_df.isRegistered.value_counts()

0    3237995
1    1236385
Name: isRegistered, dtype: int64

In [11]:
cells_df.MCC.value_counts()

234    4472966
208       1414
Name: MCC, dtype: int64

In [12]:
cells_df.MNC.value_counts()

10    4446064
30      16588
15       9070
20       2062
1         440
33        156
Name: MNC, dtype: int64

In [13]:
cells_df.PCI.value_counts()

20     247926
398    153692
4      138807
169     80231
170     64335
        ...  
123        62
497        52
499        38
496        10
501         6
Name: PCI, Length: 503, dtype: int64

In [14]:
cells_df.TAC.value_counts()

2147483647    2024952
144            635863
128             39402
16              39401
17104           38272
               ...   
6190                2
1401                2
5160                2
5164                2
5109                2
Name: TAC, Length: 97, dtype: int64

In [15]:
cells_df.asuLevel.value_counts()

0     320995
1     161461
2     161392
3     135512
4     117184
       ...  
92        12
93        10
91        10
95         4
96         4
Name: asuLevel, Length: 97, dtype: int64

In [16]:
cells_df.dBm.value_counts()

-113    404626
-109    259309
-111    256484
-107    227129
-105    205832
         ...  
-47         10
-49         10
-139         4
-45          4
-44          4
Name: dBm, Length: 97, dtype: int64

In [17]:
cells_df.level.value_counts()

2    1196462
3     963973
1     891425
4     782778
0     639742
Name: level, dtype: int64