diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
index 8e5481d453..0cfd127dde 100644
--- a/databricks/koalas/frame.py
+++ b/databricks/koalas/frame.py
@@ -2464,8 +2464,8 @@ def isnull(self):
         1  False   True  False
         """
         kdf = self.copy()
-        for name, ks in kdf.iteritems():
-            kdf[name] = ks.isnull()
+        for name, kser in kdf.iteritems():
+            kdf[name] = kser.isnull()
         return kdf

     isna = isnull
@@ -2499,8 +2499,8 @@ def notnull(self):
         1   True  False   True
         """
         kdf = self.copy()
-        for name, ks in kdf.iteritems():
-            kdf[name] = ks.notnull()
+        for name, kser in kdf.iteritems():
+            kdf[name] = kser.notnull()
         return kdf

     notna = notnull
diff --git a/databricks/koalas/generic.py b/databricks/koalas/generic.py
index 083ee31fc0..4f8fa92fe5 100644
--- a/databricks/koalas/generic.py
+++ b/databricks/koalas/generic.py
@@ -1525,17 +1525,17 @@ def _resolve_col(kdf, col_like):
         raise ValueError(col_like)


-def _spark_col_apply(kdf_or_ks, sfun):
+def _spark_col_apply(kdf_or_kser, sfun):
     """
     Performs a function to all cells on a dataframe, the function being a known sql function.
     """
     from databricks.koalas.frame import DataFrame
     from databricks.koalas.series import Series
-    if isinstance(kdf_or_ks, Series):
-        ks = kdf_or_ks
-        return ks._with_new_scol(sfun(ks._scol))
-    assert isinstance(kdf_or_ks, DataFrame)
-    kdf = kdf_or_ks
+    if isinstance(kdf_or_kser, Series):
+        kser = kdf_or_kser
+        return kser._with_new_scol(sfun(kser._scol))
+    assert isinstance(kdf_or_kser, DataFrame)
+    kdf = kdf_or_kser
     sdf = kdf._sdf
     sdf = sdf.select([sfun(kdf._internal.scol_for(col)).alias(col) for col in kdf.columns])
     return DataFrame(sdf)
diff --git a/databricks/koalas/groupby.py b/databricks/koalas/groupby.py
index e7255c1ad0..d0de43083c 100644
--- a/databricks/koalas/groupby.py
+++ b/databricks/koalas/groupby.py
@@ -702,7 +702,7 @@ def cumprod(scol):
             #
             # This is a bit hacky. Maybe we should fix it.
-            return_type = self._ks._internal.spark_type_for(self._ks._internal.column_index[0])
+            return_type = self._kser._internal.spark_type_for(self._kser._internal.column_index[0])

             @pandas_udf(returnType=return_type)
             def negative_check(s):
@@ -1143,13 +1143,13 @@ def idxmax(self, skipna=True):
         index = self._kdf._internal.index_columns[0]

         stat_exprs = []
-        for ks in self._agg_columns:
-            name = ks._internal.data_columns[0]
+        for kser in self._agg_columns:
+            name = kser._internal.data_columns[0]
             if skipna:
-                order_column = Column(ks._scol._jc.desc_nulls_last())
+                order_column = Column(kser._scol._jc.desc_nulls_last())
             else:
-                order_column = Column(ks._scol._jc.desc_nulls_first())
+                order_column = Column(kser._scol._jc.desc_nulls_first())
             window = Window.partitionBy(groupkey_cols).orderBy(order_column)
             sdf = sdf.withColumn(name, F.when(F.row_number().over(window) == 1, scol_for(sdf, index))
@@ -1160,10 +1160,10 @@
                                   index_map=[(SPARK_INDEX_NAME_FORMAT(i), s._internal.column_index[0])
                                              for i, s in enumerate(groupkeys)],
-                                  column_index=[ks._internal.column_index[0]
-                                                for ks in self._agg_columns],
-                                  column_scols=[scol_for(sdf, ks._internal.data_columns[0])
-                                                for ks in self._agg_columns])
+                                  column_index=[kser._internal.column_index[0]
+                                                for kser in self._agg_columns],
+                                  column_scols=[scol_for(sdf, kser._internal.data_columns[0])
+                                                for kser in self._agg_columns])
         return DataFrame(internal)

     # TODO: add axis parameter
@@ -1213,13 +1213,13 @@ def idxmin(self, skipna=True):
         index = self._kdf._internal.index_columns[0]

         stat_exprs = []
-        for ks in self._agg_columns:
-            name = ks._internal.data_columns[0]
+        for kser in self._agg_columns:
+            name = kser._internal.data_columns[0]
             if skipna:
-                order_column = Column(ks._scol._jc.asc_nulls_last())
+                order_column = Column(kser._scol._jc.asc_nulls_last())
             else:
-                order_column = Column(ks._scol._jc.asc_nulls_first())
+                order_column = Column(kser._scol._jc.asc_nulls_first())
             window = Window.partitionBy(groupkey_cols).orderBy(order_column)
             sdf = sdf.withColumn(name, F.when(F.row_number().over(window) == 1, scol_for(sdf, index))
@@ -1230,10 +1230,10 @@
                                   index_map=[(SPARK_INDEX_NAME_FORMAT(i), s._internal.column_index[0])
                                              for i, s in enumerate(groupkeys)],
-                                  column_index=[ks._internal.column_index[0]
-                                                for ks in self._agg_columns],
-                                  column_scols=[scol_for(sdf, ks._internal.data_columns[0])
-                                                for ks in self._agg_columns])
+                                  column_index=[kser._internal.column_index[0]
+                                                for kser in self._agg_columns],
+                                  column_scols=[scol_for(sdf, kser._internal.data_columns[0])
+                                                for kser in self._agg_columns])
         return DataFrame(internal)

     def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None):
@@ -1713,20 +1713,20 @@ def _reduce_for_stat_function(self, sfun, only_numeric):
         column_index = []
         if len(self._agg_columns) > 0:
             stat_exprs = []
-            for ks in self._agg_columns:
-                spark_type = ks.spark_type
-                name = ks._internal.data_columns[0]
-                idx = ks._internal.column_index[0]
+            for kser in self._agg_columns:
+                spark_type = kser.spark_type
+                name = kser._internal.data_columns[0]
+                idx = kser._internal.column_index[0]
                 # TODO: we should have a function that takes dataframes and converts the numeric
                 # types. Converting the NaNs is used in a few places, it should be in utils.
                 # Special handle floating point types because Spark's count treats nan as a valid
                 # value, whereas Pandas count doesn't include nan.
                 if isinstance(spark_type, DoubleType) or isinstance(spark_type, FloatType):
-                    stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(name))
+                    stat_exprs.append(sfun(F.nanvl(kser._scol, F.lit(None))).alias(name))
                     data_columns.append(name)
                     column_index.append(idx)
                 elif isinstance(spark_type, NumericType) or not only_numeric:
-                    stat_exprs.append(sfun(ks._scol).alias(name))
+                    stat_exprs.append(sfun(kser._scol).alias(name))
                     data_columns.append(name)
                     column_index.append(idx)
             sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
@@ -1875,8 +1875,8 @@

 class SeriesGroupBy(GroupBy):

-    def __init__(self, ks: Series, by: List[Series], as_index: bool = True):
-        self._ks = ks
+    def __init__(self, kser: Series, by: List[Series], as_index: bool = True):
+        self._kser = kser
         self._groupkeys = by
         if not as_index:
             raise TypeError('as_index=False only valid with DataFrame')
@@ -1894,31 +1894,31 @@ def __getattr__(self, item: str) -> Any:

     def _diff(self, *args, **kwargs):
         groupkey_scols = [s._scol for s in self._groupkeys]
-        return Series._diff(self._ks, *args, **kwargs, part_cols=groupkey_scols)
+        return Series._diff(self._kser, *args, **kwargs, part_cols=groupkey_scols)

     def _cum(self, func):
         groupkey_scols = [s._scol for s in self._groupkeys]
-        return Series._cum(self._ks, func, True, part_cols=groupkey_scols)
+        return Series._cum(self._kser, func, True, part_cols=groupkey_scols)

     def _rank(self, *args, **kwargs):
         groupkey_scols = [s._scol for s in self._groupkeys]
-        return Series._rank(self._ks, *args, **kwargs, part_cols=groupkey_scols)
+        return Series._rank(self._kser, *args, **kwargs, part_cols=groupkey_scols)

     def _fillna(self, *args, **kwargs):
         groupkey_scols = [s._scol for s in self._groupkeys]
-        return Series._fillna(self._ks, *args, **kwargs, part_cols=groupkey_scols)
+        return Series._fillna(self._kser, *args, **kwargs, part_cols=groupkey_scols)

     def _shift(self, periods, fill_value):
         groupkey_scols = [s._scol for s in self._groupkeys]
-        return Series._shift(self._ks, periods, fill_value, part_cols=groupkey_scols)
+        return Series._shift(self._kser, periods, fill_value, part_cols=groupkey_scols)

     @property
     def _kdf(self) -> DataFrame:
-        return self._ks._kdf
+        return self._kser._kdf

     @property
     def _agg_columns(self):
-        return [self._ks]
+        return [self._kser]

     def _reduce_for_stat_function(self, sfun, only_numeric):
         return _col(super(SeriesGroupBy, self)._reduce_for_stat_function(sfun, only_numeric))
diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py
index a3d24d798c..cdbf610716 100644
--- a/databricks/koalas/indexing.py
+++ b/databricks/koalas/indexing.py
@@ -122,26 +122,26 @@ class AtIndexer(object):
     >>> kdf.at[5, 'B']
     array([ 4, 20])
     """
-    def __init__(self, df_or_s):
+    def __init__(self, kdf_or_kser):
         from databricks.koalas.frame import DataFrame
         from databricks.koalas.series import Series
-        assert isinstance(df_or_s, (DataFrame, Series)), \
-            'unexpected argument type: {}'.format(type(df_or_s))
-        self._df_or_s = df_or_s
+        assert isinstance(kdf_or_kser, (DataFrame, Series)), \
+            'unexpected argument type: {}'.format(type(kdf_or_kser))
+        self._kdf_or_kser = kdf_or_kser

     @property
     def _is_df(self):
         from databricks.koalas.frame import DataFrame
-        return isinstance(self._df_or_s, DataFrame)
+        return isinstance(self._kdf_or_kser, DataFrame)

     @property
     def _is_series(self):
         from databricks.koalas.series import Series
-        return isinstance(self._df_or_s, Series)
+        return isinstance(self._kdf_or_kser, Series)

     @property
     def _internal(self):
-        return self._df_or_s._internal
+        return self._kdf_or_kser._internal

     def __getitem__(self, key):
         if self._is_df:
@@ -149,7 +149,7 @@ def __getitem__(self, key):
                 raise TypeError("Use DataFrame.at like .at[row_index, column_name]")
             row_sel, col_sel = key
         else:
-            assert self._is_series, type(self._df_or_s)
+            assert self._is_series, type(self._kdf_or_kser)
             if not isinstance(key, str) and len(key) != 1:
                 raise TypeError("Use Series.at like .at[row_index]")
             row_sel = key
@@ -357,19 +357,19 @@ class LocIndexer(object):
     9  7  8
     """

-    def __init__(self, df_or_s):
+    def __init__(self, kdf_or_kser):
         from databricks.koalas.frame import DataFrame
         from databricks.koalas.series import Series
-        assert isinstance(df_or_s, (DataFrame, Series)), \
-            'unexpected argument type: {}'.format(type(df_or_s))
-        if isinstance(df_or_s, DataFrame):
-            self._kdf = df_or_s
-            self._ks = None
+        assert isinstance(kdf_or_kser, (DataFrame, Series)), \
+            'unexpected argument type: {}'.format(type(kdf_or_kser))
+        if isinstance(kdf_or_kser, DataFrame):
+            self._kdf = kdf_or_kser
+            self._kser = None
         else:
             # If df_or_col is Column, store both the DataFrame anchored to the Column and
             # the Column itself.
-            self._kdf = df_or_s._kdf
-            self._ks = df_or_s
+            self._kdf = kdf_or_kser._kdf
+            self._kser = kdf_or_kser

     def __getitem__(self, key):
         from databricks.koalas.frame import DataFrame
@@ -381,7 +381,7 @@ def raiseNotImplemented(description):
                 pandas_function=".loc[..., ...]",
                 spark_target_function="select, where")

-        rows_sel, cols_sel = _unfold(key, self._ks)
+        rows_sel, cols_sel = _unfold(key, self._kser)

         sdf = self._kdf._sdf
         if isinstance(rows_sel, Series):
@@ -665,19 +665,19 @@ class ILocIndexer(object):
     2  1000  3000
     """

-    def __init__(self, df_or_s):
+    def __init__(self, kdf_or_kser):
         from databricks.koalas.frame import DataFrame
         from databricks.koalas.series import Series
-        assert isinstance(df_or_s, (DataFrame, Series)), \
-            'unexpected argument type: {}'.format(type(df_or_s))
-        if isinstance(df_or_s, DataFrame):
-            self._kdf = df_or_s
-            self._ks = None
+        assert isinstance(kdf_or_kser, (DataFrame, Series)), \
+            'unexpected argument type: {}'.format(type(kdf_or_kser))
+        if isinstance(kdf_or_kser, DataFrame):
+            self._kdf = kdf_or_kser
+            self._kser = None
         else:
             # If df_or_col is Column, store both the DataFrame anchored to the Column and
             # the Column itself.
-            self._kdf = df_or_s._kdf
-            self._ks = df_or_s
+            self._kdf = kdf_or_kser._kdf
+            self._kser = kdf_or_kser

     def __getitem__(self, key):
         from databricks.koalas.frame import DataFrame
@@ -690,7 +690,7 @@ def raiseNotImplemented(description):
                 pandas_function=".iloc[..., ...]",
                 spark_target_function="select, where")

-        rows_sel, cols_sel = _unfold(key, self._ks)
+        rows_sel, cols_sel = _unfold(key, self._kser)

         sdf = self._kdf._sdf
         if isinstance(rows_sel, Index):
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
index 526a6d2f2e..4c42e4dc59 100644
--- a/databricks/koalas/tests/test_dataframe.py
+++ b/databricks/koalas/tests/test_dataframe.py
@@ -854,7 +854,7 @@ def test_merge(self):
         left_kdf = ks.from_pandas(left_pdf)
         right_kdf = ks.from_pandas(right_pdf)
-        right_ks = ks.from_pandas(right_ps)
+        right_kser = ks.from_pandas(right_ps)

         def check(op, right_kdf=right_kdf, right_pdf=right_pdf):
             k_res = op(left_kdf, right_kdf)
@@ -900,22 +900,22 @@ def check(op, right_kdf=right_kdf, right_pdf=right_pdf):
         # Test Series on the right
         # pd.DataFrame.merge with Series is implemented since version 0.24.0
         if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"):
-            check(lambda left, right: left.merge(right), right_ks, right_ps)
+            check(lambda left, right: left.merge(right), right_kser, right_ps)
             check(lambda left, right: left.merge(right, left_on='x', right_on='x'),
-                  right_ks, right_ps)
+                  right_kser, right_ps)
             check(lambda left, right: left.set_index('x').merge(right, left_index=True,
-                                                                right_on='x'), right_ks, right_ps)
+                                                                right_on='x'), right_kser, right_ps)

             # Test join types with Series
             for how in ['inner', 'left', 'right', 'outer']:
-                check(lambda left, right: left.merge(right, how=how), right_ks, right_ps)
+                check(lambda left, right: left.merge(right, how=how), right_kser, right_ps)
                 check(lambda left, right: left.merge(right, left_on='x', right_on='x', how=how),
-                      right_ks, right_ps)
+                      right_kser, right_ps)

             # suffix with Series
             check(lambda left, right: left.merge(right, suffixes=['_left', '_right'], how='outer',
                                                  left_index=True, right_index=True),
-                  right_ks, right_ps)
+                  right_kser, right_ps)

         # multi-index columns
         left_columns = pd.MultiIndex.from_tuples([('a', 'lkey'), ('a', 'value'), ('b', 'x')])
diff --git a/databricks/koalas/tests/test_frame_plot.py b/databricks/koalas/tests/test_frame_plot.py
index ca8efcfe4f..998e6e8640 100644
--- a/databricks/koalas/tests/test_frame_plot.py
+++ b/databricks/koalas/tests/test_frame_plot.py
@@ -422,14 +422,14 @@ def check_kde_plot(pdf, kdf, *args, **kwargs):
         check_kde_plot(pdf1, kdf1, ind=[1, 2, 3], bw_method=3.0)

     def test_missing(self):
-        ks = self.kdf1
+        kser = self.kdf1

         unsupported_functions = ['box', 'hexbin']
         for name in unsupported_functions:
             with self.assertRaisesRegex(PandasNotImplementedError,
                                         "method.*DataFrame.*{}.*not implemented".format(name)):
-                getattr(ks.plot, name)()
+                getattr(kser.plot, name)()

     def test_topn_max_rows(self):
diff --git a/databricks/koalas/tests/test_reshape.py b/databricks/koalas/tests/test_reshape.py
index 32b9a05ce6..5dfc45a8e7 100644
--- a/databricks/koalas/tests/test_reshape.py
+++ b/databricks/koalas/tests/test_reshape.py
@@ -35,9 +35,9 @@ def test_get_dummies(self):
                           pd.DataFrame({'a': [1, 2, 3, 4, 4, 3, 2, 1],
                                         # 'b': pd.Categorical(list('abcdabcd')),
                                         'b': list('abcdabcd')})]:
-            kdf_or_ks = ks.from_pandas(pdf_or_ps)
+            kdf_or_kser = ks.from_pandas(pdf_or_ps)

-            self.assert_eq(ks.get_dummies(kdf_or_ks), pd.get_dummies(pdf_or_ps), almost=True)
+            self.assert_eq(ks.get_dummies(kdf_or_kser), pd.get_dummies(pdf_or_ps), almost=True)

     def test_get_dummies_object(self):
         pdf = pd.DataFrame({'a': [1, 2, 3, 4, 4, 3, 2, 1],
diff --git a/databricks/koalas/window.py b/databricks/koalas/window.py
index 61c015846d..49e8028e52 100644
--- a/databricks/koalas/window.py
+++ b/databricks/koalas/window.py
@@ -635,7 +635,7 @@ def __init__(self, groupby, groupkeys, window, min_periods=None):
         from databricks.koalas.groupby import DataFrameGroupBy

         if isinstance(groupby, SeriesGroupBy):
-            kdf = groupby._ks.to_frame()
+            kdf = groupby._kser.to_frame()
         elif isinstance(groupby, DataFrameGroupBy):
             kdf = groupby._kdf
         else:
@@ -1412,7 +1412,7 @@ def __init__(self, groupby, groupkeys, min_periods=1):
         from databricks.koalas.groupby import DataFrameGroupBy

         if isinstance(groupby, SeriesGroupBy):
-            kdf = groupby._ks.to_frame()
+            kdf = groupby._kser.to_frame()
         elif isinstance(groupby, DataFrameGroupBy):
             kdf = groupby._kdf
         else:
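
Reviewer note: the `ks` -> `kser` rename is more than cosmetic. Koalas code and its tests conventionally import the package as `ks`, so a local variable named `ks` shadows the module alias for the rest of the scope. Below is a minimal illustrative sketch (not part of the patch; it uses only APIs that already appear in this diff) of the hazard and of the convention the patch adopts: `ks` for the module, `kdf` for Koalas DataFrames, `kser` for Koalas Series.

    import pandas as pd
    import databricks.koalas as ks

    kdf = ks.from_pandas(pd.DataFrame({'a': [1.0, None], 'b': [None, 2.0]}))

    # Before: `for name, ks in kdf.iteritems()` rebound `ks`, hiding the module
    # alias, so a later `ks.from_pandas(...)` in the same scope would fail.
    # After: `kser` keeps the alias intact and signals "Koalas Series".
    for name, kser in kdf.iteritems():
        kdf[name] = kser.isnull()

    right_kser = ks.from_pandas(pd.Series([1, 2]))  # module alias still usable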